Merge branch 'master' of git.delkappa.com:manos/the-last-thing

Manos Katsomallos 2021-10-11 18:33:11 +02:00
commit eccc28748a
27 changed files with 188 additions and 84 deletions

View File

@@ -68,29 +68,30 @@ def main(args):
         for _ in range(args.iter):
-            lmdks, eps_out = lmdk_sel.find_lmdks(seq, lmdks, epsilon)
+            lmdks_sel, eps_out = lmdk_sel.find_lmdks(seq, lmdks, epsilon)
             # Skip
-            rls_data_s, bgts_s = lmdk_bgt.skip_cont(seq, lmdks, eps_out)
+            rls_data_s, bgts_s = lmdk_bgt.skip_cont(seq, lmdks_sel, eps_out)
             # lmdk_bgt.validate_bgts(seq, lmdks, epsilon, bgts_s)
             mae_s[i] += (lmdk_bgt.mae_cont(rls_data_s)/args.iter)*100
             # Uniform
-            rls_data_u, bgts_u = lmdk_bgt.uniform_cont(seq, lmdks, eps_out)
+            rls_data_u, bgts_u = lmdk_bgt.uniform_cont(seq, lmdks_sel, eps_out)
             # lmdk_bgt.validate_bgts(seq, lmdks, epsilon, bgts_u)
             mae_u[i] += (lmdk_bgt.mae_cont(rls_data_u)/args.iter)*100
             # Adaptive
-            rls_data_a, _, _ = lmdk_bgt.adaptive_cont(seq, lmdks, eps_out, .5, .5)
+            rls_data_a, _, _ = lmdk_bgt.adaptive_cont(seq, lmdks_sel, eps_out, .5, .5)
             mae_a[i] += (lmdk_bgt.mae_cont(rls_data_a)/args.iter)*100
             # Calculate once
-            if i == 0:
+            if pct == lmdks_pct[0]:
                 # Event
-                rls_data_evt, _ = lmdk_bgt.uniform_cont(seq, lmdk_lib.find_lmdks_cont(lmdk_data, seq, uid, 0), epsilon)
+                rls_data_evt, _ = lmdk_bgt.uniform_cont(seq, lmdks, epsilon)
                 mae_evt += (lmdk_bgt.mae_cont(rls_data_evt)/args.iter)*100
+            elif pct == lmdks_pct[-1]:
                 # User
-                rls_data_usr, _ = lmdk_bgt.uniform_cont(seq, lmdk_lib.find_lmdks_cont(lmdk_data, seq, uid, 100), epsilon)
+                rls_data_usr, _ = lmdk_bgt.uniform_cont(seq, lmdks, epsilon)
                 mae_usr += (lmdk_bgt.mae_cont(rls_data_usr)/args.iter)*100
     plt.axhline(

View File

@@ -80,12 +80,13 @@ def main(args):
             mae_a[i] += (lmdk_bgt.mae_cont(rls_data_a)/args.iter)*100
             # Calculate once
-            if i == 0:
+            if pct == lmdks_pct[0]:
                 # Event
-                rls_data_evt, _ = lmdk_bgt.uniform_cont(seq, lmdk_lib.find_lmdks_cont(lmdk_data, seq, uid, 0), epsilon)
+                rls_data_evt, _ = lmdk_bgt.uniform_cont(seq, lmdks, epsilon)
                 mae_evt += (lmdk_bgt.mae_cont(rls_data_evt)/args.iter)*100
+            elif pct == lmdks_pct[-1]:
                 # User
-                rls_data_usr, _ = lmdk_bgt.uniform_cont(seq, lmdk_lib.find_lmdks_cont(lmdk_data, seq, uid, 100), epsilon)
+                rls_data_usr, _ = lmdk_bgt.uniform_cont(seq, lmdks, epsilon)
                 mae_usr += (lmdk_bgt.mae_cont(rls_data_usr)/args.iter)*100
     plt.axhline(

View File

@@ -48,7 +48,7 @@ def main(args):
     # The y axis
     plt.ylabel('Mean absolute error (kWh)') # Set y axis label.
     plt.yscale('log')
-    plt.ylim(.1, 10000)
+    plt.ylim(.1, 100000)
     # Bar offset
     x_offset = -(bar_width/2)*(n - 1)
@@ -80,13 +80,13 @@ def main(args):
             mae_a[i] += lmdk_bgt.mae_cons(seq, rls_data_a)/args.iter
             # Calculate once
-            # Event
-            if i == 0:
-                rls_data_evt, _ = lmdk_bgt.uniform_cons(seq, seq[seq[:, 1] < lmdks_th[0]], epsilon)
+            if pct == lmdks_pct[0]:
+                # Event
+                rls_data_evt, _ = lmdk_bgt.uniform_cons(seq, lmdks, epsilon)
                 mae_evt += lmdk_bgt.mae_cons(seq, rls_data_evt)/args.iter
-            # User
-            if i == 0:
-                rls_data_usr, _ = lmdk_bgt.uniform_cons(seq, seq[seq[:, 1] < lmdks_th[len(lmdks_th)-1]], epsilon)
+            elif pct == lmdks_pct[-1]:
+                # User
+                rls_data_usr, _ = lmdk_bgt.uniform_cons(seq, lmdks, epsilon)
                 mae_usr += lmdk_bgt.mae_cons(seq, rls_data_usr)/args.iter
     plt.axhline(

View File

@@ -46,7 +46,7 @@ def main(args):
     # The y axis
     plt.ylabel('Mean absolute error (kWh)') # Set y axis label.
    plt.yscale('log')
-    plt.ylim(.1, 10000)
+    plt.ylim(.1, 100000)
     # Bar offset
     x_offset = -(bar_width/2)*(n - 1)
@@ -75,13 +75,13 @@ def main(args):
             mae_a[i] += lmdk_bgt.mae_cons(seq, rls_data_a)/args.iter
             # Calculate once
-            # Event
-            if i == 0:
-                rls_data_evt, _ = lmdk_bgt.uniform_cons(seq, seq[seq[:, 1] < lmdks_th[0]], epsilon)
+            if pct == lmdks_pct[0]:
+                # Event
+                rls_data_evt, _ = lmdk_bgt.uniform_cons(seq, lmdks, epsilon)
                 mae_evt += lmdk_bgt.mae_cons(seq, rls_data_evt)/args.iter
-            # User
-            if i == 0:
-                rls_data_usr, _ = lmdk_bgt.uniform_cons(seq, seq[seq[:, 1] < lmdks_th[len(lmdks_th)-1]], epsilon)
+            elif pct == lmdks_pct[-1]:
+                # User
+                rls_data_usr, _ = lmdk_bgt.uniform_cons(seq, lmdks, epsilon)
                 mae_usr += lmdk_bgt.mae_cons(seq, rls_data_usr)/args.iter
     plt.axhline(

View File

@@ -20,7 +20,15 @@ def main(args):
     # Distribution type
     dist_type = np.array(range(0, 4))
     # Number of landmarks
-    lmdk_n = np.array(range(int(.2*args.time), args.time, int(args.time/5)))
+    lmdk_n = np.array(range(0, args.time + 1, int(args.time/5)))
+    markers = [
+        '^', # Symmetric
+        'v', # Skewed
+        'D', # Bimodal
+        's'  # Uniform
+    ]
     # Initialize plot
     lmdk_lib.plot_init()
     # Width of bars
@@ -30,11 +38,13 @@ def main(args):
     x_margin = bar_width*(len(dist_type)/2 + 1)
     plt.xticks(x_i, ((lmdk_n/len(seq))*100).astype(int))
     plt.xlabel('Landmarks (%)') # Set x axis label.
-    plt.xlim(x_i.min() - x_margin, x_i.max() + x_margin)
+    # plt.xlim(x_i.min() - x_margin, x_i.max() + x_margin)
+    plt.xlim(x_i.min(), x_i.max())
     # The y axis
     # plt.yscale('log')
-    plt.ylabel('Euclidean distance') # Set y axis label.
-    # plt.ylabel('Wasserstein distance') # Set y axis label.
+    plt.ylim(0, 1)
+    plt.ylabel('Normalized Euclidean distance') # Set y axis label.
+    # plt.ylabel('Normalized Wasserstein distance') # Set y axis label.
     # Bar offset
     x_offset = -(bar_width/2)*(len(dist_type) - 1)
     for d_i, d in enumerate(dist_type):
@@ -47,27 +57,41 @@ def main(args):
         print('(%d/%d) %s... ' %(d_i + 1, len(dist_type), title), end='', flush=True)
         mae = np.zeros(len(lmdk_n))
         for n_i, n in enumerate(lmdk_n):
-            for r in range(args.reps):
+            if n == lmdk_n[-1]:
+                break
+            for r in range(args.iter):
                 lmdks = lmdk_lib.get_lmdks(seq, n, d)
                 hist, h = lmdk_lib.get_hist(seq, lmdks)
                 opts = lmdk_sel.get_opts_from_top_h(seq, lmdks)
                 delta = 1.0
                 res, _ = exp_mech.exponential(hist, opts, exp_mech.score, delta, epsilon)
-                mae[n_i] += lmdk_lib.get_norm(hist, res)/args.reps # Euclidean
-                # mae[n_i] += lmdk_lib.get_emd(hist, res)/args.reps # Wasserstein
+                mae[n_i] += lmdk_lib.get_norm(hist, res)/args.iter # Euclidean
+                # mae[n_i] += lmdk_lib.get_emd(hist, res)/args.iter # Wasserstein
+        mae = mae/21 # Euclidean
+        # mae = mae/11.75 # Wasserstein
         print('[OK]', flush=True)
-        # Plot bar for current distribution
-        plt.bar(
-            x_i + x_offset,
+        # # Plot bar for current distribution
+        # plt.bar(
+        #     x_i + x_offset,
+        #     mae,
+        #     bar_width,
+        #     label=label,
+        #     linewidth=lmdk_lib.line_width
+        # )
+        # # Change offset for next bar
+        # x_offset += bar_width
+        # Plot line
+        plt.plot(
+            x_i,
             mae,
-            bar_width,
             label=label,
+            marker=markers[d_i],
+            markersize=lmdk_lib.marker_size,
+            markeredgewidth=0,
             linewidth=lmdk_lib.line_width
         )
-        # Change offset for next bar
-        x_offset += bar_width
-    path = str('../../rslt/lmdk_sel_cmp/' + 'lmdk_sel_cmp-norm')
-    # path = str('../../rslt/lmdk_sel_cmp/' + 'lmdk_sel_cmp-emd')
+    path = str('../../rslt/lmdk_sel_cmp/' + 'lmdk_sel_cmp-norm-l')
+    # path = str('../../rslt/lmdk_sel_cmp/' + 'lmdk_sel_cmp-emd-l')
     # Plot legend
     lmdk_lib.plot_legend()
     # Show plot
@@ -81,7 +105,7 @@ def main(args):
 Parse arguments.
 Optional:
-    reps - The number of repetitions.
+    iter - The number of iterations.
     time - The time limit of the sequence.
 '''

 def parse_args():
@@ -91,7 +115,7 @@ def parse_args():
     # Mandatory arguments.
     # Optional arguments.
-    parser.add_argument('-r', '--reps', help='The number of repetitions.', type=int, default=1)
+    parser.add_argument('-i', '--iter', help='The number of iterations.', type=int, default=1)
     parser.add_argument('-t', '--time', help='The time limit of the sequence.', type=int, default=100)
     # Parse arguments.
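
The script above delegates the private selection step to exp_mech.exponential and scores candidates with lmdk_lib.get_norm; neither module appears in this diff. As a rough, self-contained illustration of the underlying primitive, a generic exponential mechanism over candidate histograms could look like the sketch below (all names and the toy scoring are assumptions, not the repository's API):

    import numpy as np

    def exponential_mechanism(options, score_fn, sensitivity, epsilon, rng=None):
        # Pick one of `options` with probability proportional to
        # exp(epsilon * score / (2 * sensitivity)).
        rng = np.random.default_rng() if rng is None else rng
        scores = np.array([score_fn(o) for o in options], dtype=float)
        # Subtract the max score before exponentiating for numerical stability.
        weights = np.exp(epsilon * (scores - scores.max()) / (2 * sensitivity))
        probs = weights / weights.sum()
        idx = rng.choice(len(options), p=probs)
        return options[idx], probs[idx]

    # Toy usage: prefer the candidate whose histogram is closest to a target histogram.
    target = np.array([3.0, 1.0, 0.0, 2.0])
    candidates = [np.array([3.0, 1.0, 1.0, 2.0]),
                  np.array([0.0, 0.0, 0.0, 0.0]),
                  np.array([3.0, 1.0, 0.0, 2.0])]
    chosen, p = exponential_mechanism(
        candidates,
        score_fn=lambda h: -np.linalg.norm(h - target),  # higher score = closer to the target
        sensitivity=1.0,
        epsilon=1.0,
    )

A higher epsilon concentrates the probability mass on the best-scoring candidate, while a lower one approaches a uniform draw over the options.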

View File

@@ -70,7 +70,7 @@ def main(args):
     # The y axis
     plt.ylabel('Mean absolute error (m)') # Set y axis label.
     plt.yscale('log')
-    # plt.ylim(1, 100000000)
+    plt.ylim(1, 1000000)
     # Bar offset
     x_offset = -(bar_width/2)*(n - 1)
@@ -101,12 +101,13 @@ def main(args):
             rls_data_a, _, _ = lmdk_bgt.adaptive(seq, lmdks, eps_out, .5, .5)
             mae_a[i] += lmdk_bgt.mae(seq, rls_data_a)/args.iter
-            # Event
-            if lmdk == 0:
+            # Calculate once
+            if lmdk == min(data_info[d]['lmdks']):
+                # Event
                 rls_data_evt, _ = lmdk_bgt.uniform_r(seq, lmdks, bgt['epsilon'])
                 mae_evt += lmdk_bgt.mae(seq, rls_data_evt)/args.iter
-            # User
-            if lmdk == 100:
+            elif lmdk == max(data_info[d]['lmdks']):
+                # User
                 rls_data_usr, _ = lmdk_bgt.uniform_r(seq, lmdks, bgt['epsilon'])
                 mae_usr += lmdk_bgt.mae(seq, rls_data_usr)/args.iter

View File

@@ -68,7 +68,7 @@ def main(args):
     # The y axis
     plt.ylabel('Mean absolute error (m)') # Set y axis label.
     plt.yscale('log')
-    # plt.ylim(1, 100000000)
+    plt.ylim(1, 1000000)
     # Bar offset
     x_offset = -(bar_width/2)*(n - 1)
@@ -103,12 +103,13 @@ def main(args):
             # mae_d[i] += lmdk_bgt.mae(seq, rls_data_a)/args.iter
             # s_d += s_d_c/args.iter
-            # Event
-            if lmdk == 0:
+            # Calculate once
+            if lmdk == min(data_info[d]['lmdks']):
+                # Event
                 rls_data_evt, _ = lmdk_bgt.uniform_r(seq, lmdks, bgt['epsilon'])
                 mae_evt += lmdk_bgt.mae(seq, rls_data_evt)/args.iter
-            # User
-            if lmdk == 100:
+            elif lmdk == max(data_info[d]['lmdks']):
+                # User
                 rls_data_usr, _ = lmdk_bgt.uniform_r(seq, lmdks, bgt['epsilon'])
                 mae_usr += lmdk_bgt.mae(seq, rls_data_usr)/args.iter

View File

@@ -558,10 +558,10 @@ def skip_cont(seq, lmdks, epsilon):
         # Add noise
         o = lmdk_lib.randomized_response(is_landmark, bgts[i])
         if is_landmark:
-            bgts[i] = 0
             if i > 0:
                 # Approximate with previous
                 o = rls_data[i - 1][1]
+                bgts[i] = 0
         rls_data[i] = [is_landmark, o]
     return rls_data, bgts
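
The fix above moves bgts[i] = 0 inside the i > 0 branch, so a landmark's budget is zeroed only when its release is actually approximated with the previous output (at the first timestamp there is nothing to approximate with). A minimal, self-contained sketch of that corrected control flow, with a plain randomized-response release standing in for the repository's lmdk_lib helpers (the names and the release step are illustrative assumptions, not the project's implementation):

    import math
    import random

    def randomized_response(truth, epsilon):
        # Report `truth` with probability e^eps / (e^eps + 1), flip it otherwise.
        if epsilon <= 0:
            return random.random() < 0.5  # no budget: answer uniformly at random
        p_true = math.exp(epsilon) / (math.exp(epsilon) + 1)
        return truth if random.random() < p_true else not truth

    def skip_release(is_lmdk, budgets):
        # Release a perturbed landmark indicator per timestamp; at landmarks
        # (except the very first timestamp) reuse the previous release instead.
        rls_data = []
        for i, is_landmark in enumerate(is_lmdk):
            o = randomized_response(is_landmark, budgets[i])
            if is_landmark and i > 0:
                o = rls_data[i - 1][1]  # approximate with the previous release
                budgets[i] = 0          # mark this timestamp's budget as unspent
            rls_data.append([is_landmark, o])
        return rls_data, budgets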

11 binary files changed (contents not shown)

View File

@@ -43,7 +43,7 @@ We take into account only the temporal order of the points and the position of r
 \subsection{Configurations}
 \label{subsec:eval-conf}

-\subsubsection{{\Thethings}' percentage}
+\subsubsection{{\Thething} percentage}

 For the Copenhagen data set, we achieve
 $0\%$ {\thethings} by considering an empty list of contact devices,
@@ -53,18 +53,22 @@ $60\%$ with $[181$, $182$, $192$, $195$, $196$, $201$, $203$, $207$, $221$, $230
 $80\%$ with $[260$, $282$, $287$, $289$, $290$, $291$, $308$, $311$, $318$, $323$, $324$, $330$, $334$, $335$, $344$, $350$, $353$, $355$, $357$, $358$, $361$, $363]$, and
 $100\%$ by including all of the possible contacts.
-In HUE, we get $0$, $20$ $40$, $60$, $80$, and $100$ {\thethings} percentages by setting the energy consumption threshold below $0.28$, $1.12$, $0.88$, $0.68$, $0.54$, $4.45$kWh respectively.
+In HUE, we get $0$\%, $20$\%, $40$\%, $60$\%, $80$\%, and $100$\% {\thethings} by setting the energy consumption threshold below $0.28$kWh, $1.12$kWh, $0.88$kWh, $0.68$kWh, $0.54$kWh, and $4.45$kWh, respectively.
-In T-drive, we achieved the desired {\thethings} percentages by utilizing the method of Li et al.~\cite{li2008mining} for detecting stay points in trajectory data.
+In T-drive, we achieve the desired {\thething} percentages by utilizing the method of Li et al.~\cite{li2008mining} for detecting stay points in trajectory data.
 In more detail, the algorithm checks for each data item if each subsequent item is within a given distance threshold $\Delta l$ and measures the time period $\Delta t$ between the present point and the last subsequent point.
-We achieve $0$, $20$ $40$, $60$, $80$, and $100$ {\thethings} percentages by setting the ($\Delta l$ in meters, $\Delta t$ in minutes) pairs input to the stay point discovery method as [($0$, $1000$), ($2095$, $30$), ($2790$, $30$), ($3590$, $30$), ($4825$, $30$), ($10350$, $30$)].
+We achieve $0$\%, $20$\%, $40$\%, $60$\%, $80$\%, and $100$\% {\thethings} by setting the ($\Delta l$ in meters, $\Delta t$ in minutes) pairs input to the stay point discovery method as [($0$, $1000$), ($2095$, $30$), ($2790$, $30$), ($3590$, $30$), ($4825$, $30$), ($10350$, $30$)].

 We generated synthetic data with \emph{skewed} (the {\thethings} are distributed towards the beginning/end of the series), \emph{symmetric} (in the middle), \emph{bimodal} (both end and beginning), and \emph{uniform} (all over the time series) {\thething} distributions.
 In order to get {\thethings} with the above distribution features, we generate probability distributions with appropriate characteristics and sample from them, without replacement, the desired number of points.
 %The generated distributions are representative of the cases that we wish to examine during the experiments.
-For example, for a left-skewed {\thethings} distribution we would utilize a truncated distribution resulting from the restriction of the domain of a distribution to the beginning and end of the time series with its location shifted to the center of the right half of the series.
+For example, for a left-skewed {\thething} distribution, we would utilize a truncated distribution resulting from the restriction of the domain of a distribution to the beginning and end of the time series, with its location shifted to the center of the right half of the series.
 For consistency, we calculate the scale parameter depending on the length of the series by setting it equal to the series' length over a constant.
+
+Notice that in our experiments, in the cases when we have $0\%$ and $100\%$ of the events being {\thethings}, we get the same behavior as in event- and user-level privacy, respectively.
+This happens due to the fact that, when there are no {\thethings}, at each timestamp we take into account only the data items of the current timestamp and ignore the rest of the time series (event-level).
+Whereas, when each timestamp corresponds to a {\thething}, we consider and protect all the events throughout the entire series (user-level).

 \subsubsection{Privacy parameters}
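
For context, the stay point discovery attributed to Li et al.~\cite{li2008mining} can be sketched as follows; this is a generic, self-contained reading of the algorithm (haversine distance, hypothetical parameter names), not the thesis' preprocessing code:

    import math

    def haversine_m(p, q):
        # Great-circle distance in meters between two (lat, lon, t) points in degrees.
        lat1, lon1, lat2, lon2 = map(math.radians, (p[0], p[1], q[0], q[1]))
        a = math.sin((lat2 - lat1) / 2) ** 2 + \
            math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2
        return 2 * 6371000 * math.asin(math.sqrt(a))

    def stay_points(traj, d_thres_m, t_thres_min):
        # traj: time-ordered list of (lat, lon, timestamp in minutes).
        # A stay point is the mean position of a run of points that remain within
        # d_thres_m meters of the run's first point for at least t_thres_min minutes.
        stays, i, n = [], 0, len(traj)
        while i < n - 1:
            j = i + 1
            while j < n and haversine_m(traj[i], traj[j]) <= d_thres_m:
                j += 1
            # traj[i..j-1] all lie within d_thres_m of traj[i]
            if traj[j - 1][2] - traj[i][2] >= t_thres_min:
                lat = sum(p[0] for p in traj[i:j]) / (j - i)
                lon = sum(p[1] for p in traj[i:j]) / (j - i)
                stays.append((lat, lon))
            i = j
        return stays

Loosening the ($\Delta l$, $\Delta t$) pair marks more points as stay points, which is how the listed threshold pairs sweep the {\thething} percentage from $0$\% to $100$\%.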

View File

@@ -1,6 +1,13 @@
 \chapter{Evaluation}
 \label{ch:eval}

+In this chapter, we present the experiments that we performed on real and synthetic data sets to evaluate the methodology that we introduced in Chapter~\ref{ch:lmdk-prv}.
+Section~\ref{sec:eval-dtl} contains all the details regarding the data sets that we utilized for our experiments (Section~\ref{subsec:eval-dat}), along with the parameter configurations.
+Section~\ref{sec:eval-lmdk} evaluates the data utility of the {\thething} privacy mechanisms that we designed in Section~\ref{sec:thething} and investigates the behavior of the privacy loss under temporal correlation for different distributions of {\thethings}.
+Section~\ref{sec:eval-lmdk-sel} justifies the decisions that we made while designing the privacy-preserving {\thething} selection component in Section~\ref{sec:theotherthing} and assesses its impact on data utility.
+Finally, Section~\ref{sec:eval-sum} concludes this chapter by summarizing the main takeaways of our experimental results.
+
 \input{evaluation/details}
 \input{evaluation/thething}
 \input{evaluation/theotherthing}
+\input{evaluation/summary}

View File

@@ -0,0 +1,8 @@
+\section{Summary}
+\label{sec:eval-sum}
+
+In this chapter, we presented the experimental evaluation of the {\thething} privacy mechanisms and the privacy-preserving {\thething} selection mechanism that we developed in Chapter~\ref{ch:lmdk-prv}, on real and synthetic data sets.
+The Adaptive mechanism is the most reliable and best-performing mechanism, in terms of overall data utility, with minimal tuning across most cases.
+Skip performs optimally in data sets with a low value range, where approximation fits best.
+The {\thething} selection component introduces a reasonable data utility decline to all of our mechanisms; however, Adaptive handles it well and bounds the data utility to higher levels compared to user-level protection.
+In terms of temporal correlation, we observe that, under moderate and strong correlation, a greater average regular--{\thething} event distance in a {\thething} distribution causes greater overall privacy loss.

View File

@@ -1,2 +1,61 @@
 \section{Selection of events}
-\label{sec:lmdk-sel-eval}
+\label{sec:eval-lmdk-sel}
+
+In this section, we present the experiments that we performed on real and synthetic data sets to test the methodology that we presented in Section~\ref{subsec:lmdk-sel-sol}.
+With the experiments on the synthetic data sets (Section~\ref{subsec:sel-utl}), we show the normalized Euclidean and Wasserstein distances of the time series histograms for various distributions and {\thething} percentages.
+This allows us to justify the design decisions for the concept that we showcased in Section~\ref{subsec:lmdk-sel-sol}.
+With the experiments on the real data sets (Section~\ref{subsec:sel-prv}), we show the performance, in terms of utility, of our three {\thething} mechanisms in combination with the privacy-preserving {\thething} selection component.
+
+\subsection{{\Thething} selection utility metrics}
+\label{subsec:sel-utl}
+
+Figure~\ref{fig:sel-dist} demonstrates the normalized distance that we obtain when we utilize either (a)~the Euclidean or (b)~the Wasserstein distance metric to generate a set of {\thethings} that includes regular events.
+
+\begin{figure}[htp]
+\centering
+\subcaptionbox{Euclidean\label{fig:sel-dist-norm}}{%
+\includegraphics[width=.5\linewidth]{evaluation/sel-dist-norm}%
+}%
+\subcaptionbox{Wasserstein\label{fig:sel-dist-emd}}{%
+\includegraphics[width=.5\linewidth]{evaluation/sel-dist-emd}%
+}%
+\caption{The normalized (a)~Euclidean and (b)~Wasserstein distance of the generated {\thething} sets for different {\thething} percentages.}
+\label{fig:sel-dist}
+\end{figure}
+
+Comparing the results of the Euclidean distance in Figure~\ref{fig:sel-dist-norm} with those of the Wasserstein in Figure~\ref{fig:sel-dist-emd}, we conclude that the Euclidean distance provides more consistent results for all possible distributions.
+% (0 + (0.25 + 0.25 + 0.3 + 0.3)/4 + (0.45 + 0.45 + 0.45 + 0.5)/4 + (0.5 + 0.5 + 0.7 + 0.7)/4 + (0.6 + 0.6 + 1 + 1)/4 + (0.3 + 0.3 + 0.3 + 0.3)/4)/6
+% (0 + (0.15 + 0.15 + 0.15 + 0.15)/4 + (0.2 + 0.2 + 0.3 + 0.4)/4 + (0.3 + 0.3 + 0.6 + 0.6)/4 + (0.3 + 0.3 + 1 + 1)/4 + (0.05 + 0.05 + 0.05 + 0.05)/4)
+The maximum difference between the bimodal and the skewed {\thething} distributions is approximately $0.4$ for the former and $0.7$ for the latter.
+While both methods share the same mean normalized distance of $0.4$, the Euclidean distance demonstrates a more consistent performance among all possible {\thething} distributions.
+Therefore, we choose to utilize the Euclidean distance metric for the implementation of the privacy-preserving {\thething} selection in Section~\ref{subsec:lmdk-sel-sol}.
+
+\subsection{Budget allocation and {\thething} selection}
+\label{subsec:sel-prv}
+
+Figure~\ref{fig:real-sel} exhibits the performance of Skip, Uniform, and Adaptive (see Section~\ref{subsec:lmdk-mechs}) in combination with the {\thething} selection component.
+
+\begin{figure}[htp]
+\centering
+\subcaptionbox{Copenhagen\label{fig:copenhagen-sel}}{%
+\includegraphics[width=.5\linewidth]{evaluation/copenhagen-sel}%
+}%
+\hspace{\fill}
+\subcaptionbox{HUE\label{fig:hue-sel}}{%
+\includegraphics[width=.5\linewidth]{evaluation/hue-sel}%
+}%
+\subcaptionbox{T-drive\label{fig:t-drive-sel}}{%
+\includegraphics[width=.5\linewidth]{evaluation/t-drive-sel}%
+}%
+\caption{The mean absolute error (a)~as a percentage, (b)~in kWh, and (c)~in meters of the released data for different {\thething} percentages.}
+\label{fig:real-sel}
+\end{figure}
+
+In comparison with the utility performance without the {\thething} selection component (Figure~\ref{fig:real}), we notice a slight deterioration for all three models.
+This is natural since we allocated part of the available privacy budget to the privacy-preserving {\thething} selection component, which in turn increased the number of {\thethings}.
+Therefore, there is less privacy budget available for data publishing throughout the time series for $0$\% and $100$\% {\thethings}.
+Skip performs best in our experiments with HUE because of the low range of the energy consumption values and the high scale of the Laplace noise, which it avoids thanks to its tendency to approximate.
+However, for the Copenhagen data set and T-drive, it attains a greater mean absolute error than the user-level protection scheme.
+Overall, Adaptive has a consistent performance in terms of utility for all of the data sets that we experimented with.
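
The two metrics compared above can be computed for a pair of {\thething} histograms as in the sketch below; numpy and SciPy provide the building blocks, while the function names and the caller-supplied normalization constant are assumptions rather than the thesis' lmdk_lib helpers:

    import numpy as np
    from scipy.stats import wasserstein_distance

    def euclidean_dist(hist_a, hist_b, norm_const=1.0):
        # L2 distance between two histograms, optionally scaled towards [0, 1]
        # by a caller-supplied constant (e.g., the maximum observed distance).
        a, b = np.asarray(hist_a, float), np.asarray(hist_b, float)
        return np.linalg.norm(a - b) / norm_const

    def emd(hist_a, hist_b):
        # 1-D earth mover's (Wasserstein) distance between two histograms
        # defined over the same bins.
        bins = np.arange(len(hist_a))
        return wasserstein_distance(bins, bins, u_weights=hist_a, v_weights=hist_b)

    # Toy usage: compare an original landmark histogram with a selected one.
    orig = np.array([4.0, 2.0, 1.0, 1.0])
    sel = np.array([3.0, 2.0, 2.0, 1.0])
    print(euclidean_dist(orig, sel), emd(orig, sel))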

View File

@@ -1,16 +1,11 @@
 \section{Significant events}
-\label{sec:lmdk-eval}
+\label{sec:eval-lmdk}

 % \kat{After discussing with Dimitris, I thought you are keeping one chapter for the proposals of the thesis. In this case, it would be more clean to keep the theoretical contributions in one chapter and the evaluation in a separate chapter. }
 % \mk{OK.}
-In this section we present the experiments that we performed on real and synthetic data sets.
+In this section, we present the experiments that we performed on real and synthetic data sets to test the methodology that we presented in Section~\ref{subsec:lmdk-sol}.
 With the experiments on the real data sets (Section~\ref{subsec:lmdk-expt-bgt}), we show the performance in terms of utility of our three {\thething} mechanisms.
 With the experiments on the synthetic data sets (Section~\ref{subsec:lmdk-expt-cor}) we show the privacy loss by our framework when tuning the size and statistical characteristics of the input {\thething} set $L$ with special emphasis on how the privacy loss under temporal correlation is affected by the number and distribution of the {\thethings}.
-Notice that in our experiments, in the cases when we have $0\%$ and $100\%$ of the events being {\thethings}, we get the same behavior as in event- and user-level privacy respectively.
-This happens due the fact that at each timestamp we take into account only the data items at the current timestamp and ignore the rest of the time series (event-level) when there are no {\thethings}.
-Whereas, when each timestamp corresponds to a {\thething} we consider and protect all the events throughout the entire series (user-level).

 \subsection{Budget allocation schemes}
@@ -30,20 +25,21 @@ Figure~\ref{fig:real} exhibits the performance of the three mechanisms: Skip, Un
 \subcaptionbox{T-drive\label{fig:t-drive}}{%
 \includegraphics[width=.5\linewidth]{evaluation/t-drive}%
 }%
-\caption{The mean absolute error (a)~as a percentage, (b)~in kWh, and (c)~in meters of the released data for different {\thethings} percentages.}
+\caption{The mean absolute error (a)~as a percentage, (b)~in kWh, and (c)~in meters of the released data for different {\thething} percentages.}
 \label{fig:real}
 \end{figure}

 % For the Geolife data set (Figure~\ref{fig:geolife}), Skip has the best performance (measured in Mean Absolute Error, in meters) because it invests the most budget overall at every regular event, by approximating the {\thething} data based on previous releases.
 % Due to the data set's high density (every $1$--$5$ seconds or every $5$--$10$ meters per point) approximating constantly has a low impact on the data utility.
 % On the contrary, the lower density of the T-drive data set (Figure~\ref{fig:t-drive}) has a negative impact on the performance of Skip.
-For the Copenhagen data set (Figure~\ref{fig:copenhagen}), Adaptive has a constant overall performance and performs best for $0$, $60$, and $80$\% {\thethings}.
+For the Copenhagen data set (Figure~\ref{fig:copenhagen}), Adaptive has a constant overall performance and performs best for $0$\%, $60$\%, and $80$\% {\thethings}.
+We notice that for $0$\% {\thethings}, it achieves better utility than the event-level protection.
-The Skip model excels, compared to the others, at cases where it needs to approximate a lot ($100$\%).
-The combination of the low range in HUE ($[0.28$, $4.45]$ with an average of $0.88$kWh) and the large scale in the Laplace mechanism results in a low mean absolute error for Skip(Figure~\ref{fig:hue}).
+The Skip model excels, compared to the others, at cases where it needs to approximate $20$\%--$40$\% or $100$\% of the times.
+The combination of the low range in HUE ($[0.28$, $4.45]$ with an average of $0.88$kWh) and the large scale in the Laplace mechanism results in a low mean absolute error for Skip (Figure~\ref{fig:hue}).
 In general, a scheme that favors approximation over noise injection would achieve a better performance in this case.
-However, the Adaptive model performs by far better than Uniform and strikes a nice balance between event- and user-level protection for all {\thethings} percentages.
+However, the Adaptive model performs far better than Uniform and strikes a nice balance between event- and user-level protection for all {\thething} percentages.
-In the T-drive data set (Figure~\ref{fig:t-drive}), the Adaptive mechanism outperforms the Uniform one by $10$\%--$20$\% for all {\thethings} percentages greater than $40$ and by more than $20$\% the Skip one.
+In the T-drive data set (Figure~\ref{fig:t-drive}), the Adaptive mechanism outperforms Uniform by $10$\%--$20$\% for all {\thething} percentages greater than $40$\% and Skip by more than $20$\%.
-The lower density (average distance of $623$ meters) of the T-drive data set has a negative impact on the performance of Skip.
+The lower density (average distance of $623$m) of the T-drive data set has a negative impact on the performance of Skip.
 In general, we can claim that the Adaptive is the most reliable and best performing mechanism with minimal tuning, if we take into consideration the drawbacks of the Skip mechanism mentioned in Section~\ref{subsec:lmdk-mechs}.
 Moreover, designing a data-dependent sampling scheme would possibly result in better results for Adaptive.
@@ -54,10 +50,6 @@ Moreover, designing a data-dependent sampling scheme would possibly result in be
 Figure~\ref{fig:avg-dist} shows a comparison of the average temporal distance of the events from the previous/next {\thething} or the start/end of the time series for various distributions in synthetic data.
 More particularly, we count for every event the total number of events between itself and the nearest {\thething} or the series edge.
-We observe that the uniform and bimodal distributions tend to limit the regular event--{\thething} distance.
-This is due to the fact that the former scatters the {\thethings}, while the latter distributes them on both edges, leaving a shorter space uninterrupted by {\thethings}.
-% and as a result they reduce the uninterrupted space by landmarks in the sequence.
-On the contrary, distributing the {\thethings} at one part of the sequence, as in skewed or symmetric, creates a wider space without {\thethings}.

 \begin{figure}[htp]
 \centering
@@ -66,14 +58,13 @@ On the contrary, distributing the {\thethings} at one part of the sequence, as i
 \label{fig:avg-dist}
 \end{figure}

+We observe that the uniform and bimodal distributions tend to limit the regular event--{\thething} distance.
+This is due to the fact that the former scatters the {\thethings}, while the latter distributes them on both edges, leaving a shorter space uninterrupted by {\thethings}.
+% and as a result they reduce the uninterrupted space by landmarks in the sequence.
+On the contrary, distributing the {\thethings} at one part of the sequence, as in skewed or symmetric, creates a wider space without {\thethings}.
+
 Figure~\ref{fig:dist-cor} illustrates a comparison among the aforementioned distributions regarding the overall privacy loss under (a)~weak, (b)~moderate, and (c)~strong temporal correlation degrees.
 The line shows the overall privacy loss---for all cases of {\thethings} distribution---without temporal correlation.
-In combination with Figure~\ref{fig:avg-dist}, we conclude that a greater average event--{\thething} distance in a distribution can result into greater overall privacy loss under moderate and strong temporal correlation.
-This is due to the fact that the backward/forward privacy loss accumulates more over time in wider spaces without {\thethings} (see Section~\ref{sec:correlation}).
-Furthermore, the behavior of the privacy loss is as expected regarding the temporal correlation degree.
-Predictably, a stronger correlation degree generates higher privacy loss while widening the gap between the different distribution cases.
-On the contrary, a weaker correlation degree makes it harder to differentiate among the {\thethings} distributions.
-The privacy loss under a weak correlation degree converge.

 \begin{figure}[htp]
 \centering
@@ -91,3 +82,10 @@ The privacy loss under a weak correlation degree converge.
 The line shows the overall privacy loss without temporal correlation.}
 \label{fig:dist-cor}
 \end{figure}
+
+In combination with Figure~\ref{fig:avg-dist}, we conclude that a greater average event--{\thething} distance in a distribution can result in greater overall privacy loss under moderate and strong temporal correlation.
+This is due to the fact that the backward/forward privacy loss accumulates more over time in wider spaces without {\thethings} (see Section~\ref{sec:correlation}).
+Furthermore, the behavior of the privacy loss is as expected regarding the temporal correlation degree.
+Predictably, a stronger correlation degree generates higher privacy loss while widening the gap between the different distribution cases.
+On the contrary, a weaker correlation degree makes it harder to differentiate among the {\thethings} distributions.
+The privacy loss under a weak correlation degree converges.

View File

@@ -1,4 +1,4 @@
-\chapter{Landmark Privacy}
+\chapter{{\Thething} privacy}
 \label{ch:lmdk-prv}

 % Crowdsensing applications