lmdk-expt: Reviewed all graphs for synthetic

Manos Katsomallos 2021-10-09 13:27:16 +02:00
parent edb98f736d
commit b03b510f02
12 changed files with 1839 additions and 10 deletions

code/expt/avg_dist.py (new file, +121)

@@ -0,0 +1,121 @@
#!/usr/bin/env python3

import sys
sys.path.insert(1, '../lib')
import argparse
import gdp
import lmdk_lib
import math
from matplotlib import pyplot as plt
import numpy as np
import os
import time


def main(args):
    # Number of timestamps
    seq = lmdk_lib.get_seq(1, args.time)
    # Distribution type
    dist_type = np.array(range(0, 4))
    # Number of landmarks
    lmdk_n = np.array(range(0, args.time + 1, int(args.time/5)))
    markers = [
        '^',  # Symmetric
        'v',  # Skewed
        'D',  # Bimodal
        's'   # Uniform
    ]
    # Initialize plot
    lmdk_lib.plot_init()
    # The x axis
    x_i = np.arange(len(lmdk_n))
    plt.xticks(x_i, ((lmdk_n/len(seq))*100).astype(int))
    plt.xlabel('Landmarks (%)')  # Set x axis label.
    plt.xlim(x_i.min(), x_i.max())
    # The y axis
    plt.ylabel('Normalized average distance')  # Set y axis label.
    plt.yscale('log')
    plt.ylim(.001, 1)
    # Logging
    print('Average distance', end='', flush=True)
    for d_i, d in enumerate(dist_type):
        avg_dist = np.zeros(len(lmdk_n))
        # Logging
        print('.', end='', flush=True)
        for i, n in enumerate(lmdk_n):
            for r in range(args.reps):
                # Generate landmarks
                lmdks = lmdk_lib.get_lmdks(seq, n, d)
                # Calculate average distance
                avg_cur = 0
                for t in seq:
                    t_prv, t_nxt = gdp.get_limits(t, seq, lmdks)
                    avg_cur += (abs(t - t_prv) - 1 + abs(t - t_nxt) - 1)/len(seq)
                # Normalized average based on repetitions
                avg_dist[i] += avg_cur/args.reps
        # Rescaling (min-max normalization)
        # https://en.wikipedia.org/wiki/Feature_scaling#Rescaling_(min-max_normalization)
        avg_dist = (avg_dist - avg_dist.min())/(avg_dist.max() - avg_dist.min())
        # Normalize for log scale
        if avg_dist[-1] == 0:
            avg_dist[-1] = .001
        # Set label
        label = lmdk_lib.dist_type_to_str(d_i)
        if d_i == 1:
            label = 'Skewed'
        # Plot line
        plt.plot(
            x_i,
            avg_dist,
            label=label,
            marker=markers[d_i],
            markersize=lmdk_lib.marker_size,
            markeredgewidth=0,
            linewidth=lmdk_lib.line_width
        )
    # Plot legend
    lmdk_lib.plot_legend()
    # Show plot
    # plt.show()
    # Save plot
    lmdk_lib.save_plot('../../rslt/avg_dist/avg-dist.pdf')
    print(' [OK]', flush=True)


def parse_args():
    '''
    Parse arguments.

    Optional:
        reps - The number of repetitions.
        time - The time limit of the sequence.
    '''
    # Create argument parser.
    parser = argparse.ArgumentParser()
    # Optional arguments.
    parser.add_argument('-r', '--reps', help='The number of repetitions.', type=int, default=1)
    parser.add_argument('-t', '--time', help='The time limit of the sequence.', type=int, default=100)
    # Parse arguments.
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    try:
        args = parse_args()
        start_time = time.time()
        main(args)
        end_time = time.time()
        print('##############################')
        print('Time elapsed: %s' % (time.strftime('%H:%M:%S', time.gmtime(end_time - start_time))))
        print('##############################')
    except KeyboardInterrupt:
        print('Interrupted by user.')
        exit()
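
The helpers doing the real work above (lmdk_lib.get_lmdks, gdp.get_limits) live in code/lib and are not shown in this commit, so the snippet below is a minimal, self-contained sketch of the distance metric that main() averages. The get_limits stand-in here is a hypothetical reimplementation, assumed to return the nearest landmark, or the sequence edge, on either side of t; the actual gdp.get_limits may differ.

#!/usr/bin/env python3
# Sketch: average event--landmark distance on a toy sequence.

def get_limits(t, seq, lmdks):
    # Previous landmark, or the start of the sequence if none precedes t.
    t_prv = max([l for l in lmdks if l <= t], default=seq[0])
    # Next landmark, or the end of the sequence if none follows t.
    t_nxt = min([l for l in lmdks if l >= t], default=seq[-1])
    return t_prv, t_nxt

seq = list(range(1, 11))  # timestamps 1..10
lmdks = [3, 7]            # 20% landmarks
avg = sum(abs(t - p) - 1 + abs(t - n) - 1
          for t in seq
          for p, n in [get_limits(t, seq, lmdks)]) / len(seq)
print('Average event--landmark distance: %.2f' % avg)

The script itself runs with, e.g., python3 avg_dist.py -r 10 -t 100, averaging over 10 repetitions of a 100-timestamp sequence (the defaults are -r 1 -t 100).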

code/expt/dist_cor.py (new file, +131)

@@ -0,0 +1,131 @@
#!/usr/bin/env python3

import sys
sys.path.insert(1, '../lib')
import argparse
import gdp
import itertools
import lmdk_bgt
import lmdk_lib
import numpy as np
import os
from matplotlib import pyplot as plt
import time


def main(args):
    # Privacy goal
    epsilon = 1.0
    # Number of timestamps
    seq = lmdk_lib.get_seq(1, args.time)
    # Correlation degree (higher values mean weaker correlations)
    cor_deg = np.array([.01, .1, 1.0])
    cor_lbl = ['Strong correlation', 'Moderate correlation', 'Weak correlation']
    # Distribution type
    dist_type = np.array(range(0, 4))
    # Number of landmarks
    lmdk_n = np.array(range(0, args.time + 1, int(args.time/5)))
    # Width of bars
    bar_width = 1/(len(dist_type) + 1)
    # For each correlation degree
    for c_i, c in enumerate(cor_deg):
        # Logging
        title = cor_lbl[c_i]
        print('(%d/%d) %s' % (c_i + 1, len(cor_deg), title), end='', flush=True)
        # The transition matrix
        p = gdp.gen_trans_mt(2, c)
        # Bar offset
        x_offset = -(bar_width/2)*(len(dist_type) - 1)
        # Initialize plot
        lmdk_lib.plot_init()
        # The x axis
        x_i = np.arange(len(lmdk_n))
        plt.xticks(x_i, ((lmdk_n/len(seq))*100).astype(int))
        plt.xlabel('Landmarks (%)')  # Set x axis label.
        x_margin = bar_width*(len(dist_type)/2 + 1)
        plt.xlim(x_i.min() - x_margin, x_i.max() + x_margin)
        # The y axis
        plt.ylabel('Privacy loss')  # Set y axis label.
        plt.yscale('log')
        plt.ylim(epsilon/10, 100*len(seq))
        # plt.ylim(0, 10000)
        for d_i, d in enumerate(dist_type):
            print('.', end='', flush=True)
            # Initialization
            e = np.zeros(len(lmdk_n))
            a = np.zeros(len(lmdk_n))
            for i, n in enumerate(lmdk_n):
                for r in range(args.reps):
                    # Generate landmarks
                    lmdks = lmdk_lib.get_lmdks(seq, n, d)
                    # Uniform budget allocation
                    e_cur = lmdk_bgt.uniform(seq, lmdks, epsilon)
                    _, _, a_cur = gdp.tpl_lmdk_mem(e_cur, p, p, seq, lmdks)
                    # Save privacy loss
                    e[i] += np.sum(e_cur)/args.reps
                    a[i] += np.sum(a_cur)/args.reps
            # Set label
            label = lmdk_lib.dist_type_to_str(d_i)
            if d_i == 1:
                label = 'Skewed'
            # Plot bar for current distribution
            plt.bar(
                x_i + x_offset,
                a,
                bar_width,
                label=label,
                linewidth=lmdk_lib.line_width
            )
            # Change offset for next bar
            x_offset += bar_width
        # Plot line for no correlation
        plt.plot(
            x_i,
            e,
            linewidth=lmdk_lib.line_width,
            color='#e0e0e0',
        )
        # Plot legend
        lmdk_lib.plot_legend()
        # Show plot
        # plt.show()
        # Save plot
        lmdk_lib.save_plot('../../rslt/dist_cor/' + title + '.pdf')
        print(' [OK]', flush=True)


def parse_args():
    '''
    Parse arguments.

    Optional:
        reps - The number of repetitions.
        time - The time limit of the sequence.
    '''
    # Create argument parser.
    parser = argparse.ArgumentParser()
    # Optional arguments.
    parser.add_argument('-r', '--reps', help='The number of repetitions.', type=int, default=1)
    parser.add_argument('-t', '--time', help='The time limit of the sequence.', type=int, default=100)
    # Parse arguments.
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    try:
        args = parse_args()
        start_time = time.time()
        main(args)
        end_time = time.time()
        print('##############################')
        print('Time elapsed: %s' % (time.strftime('%H:%M:%S', time.gmtime(end_time - start_time))))
        print('##############################')
    except KeyboardInterrupt:
        print('Interrupted by user.')
        exit()
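
The correlation knob in this script comes from gdp.gen_trans_mt(2, c), whose source is suppressed in this commit (see code/lib/gdp.py below). The snippet that follows is only a plausible stand-in consistent with the comment that higher values of c mean weaker correlations, not necessarily the construction gdp.py uses: it blends an identity matrix (strongly correlated transitions) with a uniform matrix (uncorrelated transitions).

import numpy as np

def gen_trans_mt_sketch(n, c):
    # Hypothetical stand-in for gdp.gen_trans_mt: a convex combination of
    # the identity matrix and the uniform matrix, so c -> 0 yields strong
    # correlation and c = 1 yields none. Each row remains a valid
    # probability distribution (rows sum to 1).
    s = min(max(c, 0.0), 1.0)
    return (1 - s)*np.eye(n) + s*np.full((n, n), 1/n)

for c in [.01, .1, 1.0]:  # the degrees used above
    print('c = %.2f\n%s' % (c, gen_trans_mt_sketch(2, c)))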

code/lib/gdp.py (new file, +1576)

File diff suppressed because it is too large.

BIN (binary file not shown)
BIN rslt/avg_dist/avg-dist.pdf (new file; binary file not shown)
BIN (binary file not shown)
BIN (binary file not shown)


@@ -51,7 +51,8 @@ In general, we can claim that the Adaptive is the most reliable and best perform
 Moreover, designing a data-dependent sampling scheme would possibly result in better results for Adaptive.
-\paragraph{Temporal distance and correlation}
+\subsubsection{Temporal distance and correlation}
 Figure~\ref{fig:avg-dist} shows a comparison of the average temporal distance of the events from the previous/next {\thething} or the start/end of the time series for various distributions in synthetic data.
 More particularly, we count for every event the total number of events between itself and the nearest {\thething} or the series edge.
 We observe that the uniform and bimodal distributions tend to limit the regular event--{\thething} distance.
@@ -61,33 +62,33 @@ On the contrary, distributing the {\thethings} at one part of the sequence, as i
 \begin{figure}[htp]
 \centering
-\includegraphics[width=.5\linewidth]{avg-dist}%
+\includegraphics[width=.5\linewidth]{evaluation/avg-dist}%
 \caption{Average temporal distance of the events from the {\thethings} for different {\thethings} percentages within a time series in various {\thethings} distributions.}
 \label{fig:avg-dist}
 \end{figure}
-Figure~\ref{fig:dist-cor} illustrates a comparison among the aforementioned distributions regarding the overall privacy loss under moderate (Figure~\ref{fig:dist-cor-mod}), and strong (Figure~\ref{fig:dist-cor-stg}) correlation degrees.
+Figure~\ref{fig:dist-cor} illustrates a comparison among the aforementioned distributions regarding the overall privacy loss under (a)~weak, (b)~moderate, and (c)~strong temporal correlation degrees.
 The line shows the overall privacy loss---for all cases of {\thethings} distribution---without temporal correlation.
-We skip the presentation of the results under a weak correlation degree, since they converge in this case.
-In combination with Figure~\ref{fig:avg-dist}, we conclude that a greater average event-{\thething} distance in a distribution can result into greater overall privacy loss under moderate and strong temporal correlation.
-This is due to the fact that the backward/forward privacy loss accumulates more over time in wider spaces without {\thethings} (see Section~\ref{subsec:correlations}).
+In combination with Figure~\ref{fig:avg-dist}, we conclude that a greater average event--{\thething} distance in a distribution can result in greater overall privacy loss under moderate and strong temporal correlation.
+This is due to the fact that the backward/forward privacy loss accumulates more over time in wider spaces without {\thethings} (see Section~\ref{sec:correlation}).
 Furthermore, the behavior of the privacy loss is as expected regarding the temporal correlation degree.
 Predictably, a stronger correlation degree generates higher privacy loss while widening the gap between the different distribution cases.
 On the contrary, a weaker correlation degree makes it harder to differentiate among the {\thethings} distributions.
+The privacy loss under a weak correlation degree converges.
 \begin{figure}[htp]
 \centering
 \subcaptionbox{Weak correlation\label{fig:dist-cor-wk}}{%
-\includegraphics[width=.5\linewidth]{dist-cor-wk}%
+\includegraphics[width=.5\linewidth]{evaluation/dist-cor-wk}%
 }%
 \hspace{\fill}
 \subcaptionbox{Moderate correlation\label{fig:dist-cor-mod}}{%
-\includegraphics[width=.5\linewidth]{dist-cor-mod}%
+\includegraphics[width=.5\linewidth]{evaluation/dist-cor-mod}%
 }%
 \subcaptionbox{Strong correlation\label{fig:dist-cor-stg}}{%
-\includegraphics[width=.5\linewidth]{dist-cor-stg}%
+\includegraphics[width=.5\linewidth]{evaluation/dist-cor-stg}%
 }%
-\caption{Privacy loss for different {\thethings} percentages and distributions, under weak, moderate, and strong degrees of temporal correlation.
+\caption{Privacy loss for different {\thethings} percentages and distributions, under (a)~weak, (b)~moderate, and (c)~strong degrees of temporal correlation.
 The line shows the overall privacy loss without temporal correlation.}
 \label{fig:dist-cor}
 \end{figure}
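
For reference, the metric behind Figure~\ref{fig:avg-dist} can be written down directly from what avg_dist.py computes. In the sketch below, prv(t) and nxt(t) are assumed to denote the nearest {\thething} (or series edge) before and after event t, matching the role of gdp.get_limits in the script:

\[
d_{\mathrm{avg}} = \frac{1}{|T|} \sum_{t \in T} \Big[ \big( |t - prv(t)| - 1 \big) + \big( |t - nxt(t)| - 1 \big) \Big]
\]

Under this reading, placing all {\thethings} in one part of the sequence, as in the skewed case, maximizes the gaps and hence the average distance, which is consistent with the larger privacy loss observed for skewed distributions under moderate and strong correlation.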