Merge branch 'master' of git.delkappa.com:manos/the-last-thing
This commit is contained in: commit eccc28748a
@@ -68,29 +68,30 @@ def main(args):
for _ in range(args.iter):
lmdks, eps_out = lmdk_sel.find_lmdks(seq, lmdks, epsilon)
lmdks_sel, eps_out = lmdk_sel.find_lmdks(seq, lmdks, epsilon)
# Skip
rls_data_s, bgts_s = lmdk_bgt.skip_cont(seq, lmdks, eps_out)
rls_data_s, bgts_s = lmdk_bgt.skip_cont(seq, lmdks_sel, eps_out)
# lmdk_bgt.validate_bgts(seq, lmdks, epsilon, bgts_s)
mae_s[i] += (lmdk_bgt.mae_cont(rls_data_s)/args.iter)*100
# Uniform
rls_data_u, bgts_u = lmdk_bgt.uniform_cont(seq, lmdks, eps_out)
rls_data_u, bgts_u = lmdk_bgt.uniform_cont(seq, lmdks_sel, eps_out)
# lmdk_bgt.validate_bgts(seq, lmdks, epsilon, bgts_u)
mae_u[i] += (lmdk_bgt.mae_cont(rls_data_u)/args.iter)*100
# Adaptive
rls_data_a, _, _ = lmdk_bgt.adaptive_cont(seq, lmdks, eps_out, .5, .5)
rls_data_a, _, _ = lmdk_bgt.adaptive_cont(seq, lmdks_sel, eps_out, .5, .5)
mae_a[i] += (lmdk_bgt.mae_cont(rls_data_a)/args.iter)*100
# Calculate once
if i == 0:
if pct == lmdks_pct[0]:
# Event
rls_data_evt, _ = lmdk_bgt.uniform_cont(seq, lmdk_lib.find_lmdks_cont(lmdk_data, seq, uid, 0), epsilon)
rls_data_evt, _ = lmdk_bgt.uniform_cont(seq, lmdks, epsilon)
mae_evt += (lmdk_bgt.mae_cont(rls_data_evt)/args.iter)*100
elif pct == lmdks_pct[-1]:
# User
rls_data_usr, _ = lmdk_bgt.uniform_cont(seq, lmdk_lib.find_lmdks_cont(lmdk_data, seq, uid, 100), epsilon)
rls_data_usr, _ = lmdk_bgt.uniform_cont(seq, lmdks, epsilon)
mae_usr += (lmdk_bgt.mae_cont(rls_data_usr)/args.iter)*100
plt.axhline(
@@ -80,12 +80,13 @@ def main(args):
mae_a[i] += (lmdk_bgt.mae_cont(rls_data_a)/args.iter)*100
# Calculate once
if i == 0:
if pct == lmdks_pct[0]:
# Event
rls_data_evt, _ = lmdk_bgt.uniform_cont(seq, lmdk_lib.find_lmdks_cont(lmdk_data, seq, uid, 0), epsilon)
rls_data_evt, _ = lmdk_bgt.uniform_cont(seq, lmdks, epsilon)
mae_evt += (lmdk_bgt.mae_cont(rls_data_evt)/args.iter)*100
elif pct == lmdks_pct[-1]:
# User
rls_data_usr, _ = lmdk_bgt.uniform_cont(seq, lmdk_lib.find_lmdks_cont(lmdk_data, seq, uid, 100), epsilon)
rls_data_usr, _ = lmdk_bgt.uniform_cont(seq, lmdks, epsilon)
mae_usr += (lmdk_bgt.mae_cont(rls_data_usr)/args.iter)*100
plt.axhline(
@@ -48,7 +48,7 @@ def main(args):
# The y axis
plt.ylabel('Mean absolute error (kWh)') # Set y axis label.
plt.yscale('log')
plt.ylim(.1, 10000)
plt.ylim(.1, 100000)
# Bar offset
x_offset = -(bar_width/2)*(n - 1)
@@ -80,13 +80,13 @@ def main(args):
mae_a[i] += lmdk_bgt.mae_cons(seq, rls_data_a)/args.iter
# Calculate once
# Event
if i == 0:
rls_data_evt, _ = lmdk_bgt.uniform_cons(seq, seq[seq[:, 1] < lmdks_th[0]], epsilon)
if pct == lmdks_pct[0]:
# Event
rls_data_evt, _ = lmdk_bgt.uniform_cons(seq, lmdks, epsilon)
mae_evt += lmdk_bgt.mae_cons(seq, rls_data_evt)/args.iter
# User
if i == 0:
rls_data_usr, _ = lmdk_bgt.uniform_cons(seq, seq[seq[:, 1] < lmdks_th[len(lmdks_th)-1]], epsilon)
elif pct == lmdks_pct[-1]:
# User
rls_data_usr, _ = lmdk_bgt.uniform_cons(seq, lmdks, epsilon)
mae_usr += lmdk_bgt.mae_cons(seq, rls_data_usr)/args.iter
plt.axhline(
@@ -46,7 +46,7 @@ def main(args):
# The y axis
plt.ylabel('Mean absolute error (kWh)') # Set y axis label.
plt.yscale('log')
plt.ylim(.1, 10000)
plt.ylim(.1, 100000)
# Bar offset
x_offset = -(bar_width/2)*(n - 1)
@@ -75,13 +75,13 @@ def main(args):
mae_a[i] += lmdk_bgt.mae_cons(seq, rls_data_a)/args.iter
# Calculate once
# Event
if i == 0:
rls_data_evt, _ = lmdk_bgt.uniform_cons(seq, seq[seq[:, 1] < lmdks_th[0]], epsilon)
if pct == lmdks_pct[0]:
# Event
rls_data_evt, _ = lmdk_bgt.uniform_cons(seq, lmdks, epsilon)
mae_evt += lmdk_bgt.mae_cons(seq, rls_data_evt)/args.iter
# User
if i == 0:
rls_data_usr, _ = lmdk_bgt.uniform_cons(seq, seq[seq[:, 1] < lmdks_th[len(lmdks_th)-1]], epsilon)
elif pct == lmdks_pct[-1]:
# User
rls_data_usr, _ = lmdk_bgt.uniform_cons(seq, lmdks, epsilon)
mae_usr += lmdk_bgt.mae_cons(seq, rls_data_usr)/args.iter
plt.axhline(
@@ -20,7 +20,15 @@ def main(args):
# Distribution type
dist_type = np.array(range(0, 4))
# Number of landmarks
lmdk_n = np.array(range(int(.2*args.time), args.time, int(args.time/5)))
lmdk_n = np.array(range(0, args.time + 1, int(args.time/5)))
markers = [
'^', # Symmetric
'v', # Skewed
'D', # Bimodal
's' # Uniform
]
# Initialize plot
lmdk_lib.plot_init()
# Width of bars
@@ -30,11 +38,13 @@ def main(args):
x_margin = bar_width*(len(dist_type)/2 + 1)
plt.xticks(x_i, ((lmdk_n/len(seq))*100).astype(int))
plt.xlabel('Landmarks (%)') # Set x axis label.
plt.xlim(x_i.min() - x_margin, x_i.max() + x_margin)
# plt.xlim(x_i.min() - x_margin, x_i.max() + x_margin)
plt.xlim(x_i.min(), x_i.max())
# The y axis
# plt.yscale('log')
plt.ylabel('Euclidean distance') # Set y axis label.
# plt.ylabel('Wasserstein distance') # Set y axis label.
plt.ylim(0, 1)
plt.ylabel('Normalized Euclidean distance') # Set y axis label.
# plt.ylabel('Normalized Wasserstein distance') # Set y axis label.
# Bar offset
x_offset = -(bar_width/2)*(len(dist_type) - 1)
for d_i, d in enumerate(dist_type):
@@ -47,27 +57,41 @@ def main(args):
print('(%d/%d) %s... ' %(d_i + 1, len(dist_type), title), end='', flush=True)
mae = np.zeros(len(lmdk_n))
for n_i, n in enumerate(lmdk_n):
for r in range(args.reps):
if n == lmdk_n[-1]:
break
for r in range(args.iter):
lmdks = lmdk_lib.get_lmdks(seq, n, d)
hist, h = lmdk_lib.get_hist(seq, lmdks)
opts = lmdk_sel.get_opts_from_top_h(seq, lmdks)
delta = 1.0
res, _ = exp_mech.exponential(hist, opts, exp_mech.score, delta, epsilon)
mae[n_i] += lmdk_lib.get_norm(hist, res)/args.reps # Euclidean
# mae[n_i] += lmdk_lib.get_emd(hist, res)/args.reps # Wasserstein
mae[n_i] += lmdk_lib.get_norm(hist, res)/args.iter # Euclidean
# mae[n_i] += lmdk_lib.get_emd(hist, res)/args.iter # Wasserstein
mae = mae/21 # Euclidean
# mae = mae/11.75 # Wasserstein
print('[OK]', flush=True)
# Plot bar for current distribution
plt.bar(
x_i + x_offset,
# # Plot bar for current distribution
# plt.bar(
# x_i + x_offset,
# mae,
# bar_width,
# label=label,
# linewidth=lmdk_lib.line_width
# )
# # Change offset for next bar
# x_offset += bar_width
# Plot line
plt.plot(
x_i,
mae,
bar_width,
label=label,
marker=markers[d_i],
markersize=lmdk_lib.marker_size,
markeredgewidth=0,
linewidth=lmdk_lib.line_width
)
# Change offset for next bar
x_offset += bar_width
path = str('../../rslt/lmdk_sel_cmp/' + 'lmdk_sel_cmp-norm')
# path = str('../../rslt/lmdk_sel_cmp/' + 'lmdk_sel_cmp-emd')
path = str('../../rslt/lmdk_sel_cmp/' + 'lmdk_sel_cmp-norm-l')
# path = str('../../rslt/lmdk_sel_cmp/' + 'lmdk_sel_cmp-emd-l')
# Plot legend
lmdk_lib.plot_legend()
# Show plot
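For reference, a minimal sketch of what an exponential-mechanism selection such as the exp_mech.exponential(hist, opts, exp_mech.score, delta, epsilon) call above presumably does: score every candidate option and draw one with probability proportional to exp(epsilon * score / (2 * delta)). The function body and return values here are assumptions inferred from the call site, not the repository's actual implementation (which may, for instance, use a different sign convention for the score or return something other than the selection probability).

import numpy as np

def exponential_sketch(x, opts, score, delta, epsilon):
    # Score each candidate option against the input x.
    scores = np.array([score(x, o) for o in opts], dtype=float)
    # Exponential mechanism: Pr[o] proportional to exp(epsilon * score / (2 * delta)),
    # where delta is the sensitivity of the score function.
    # Shift by the maximum score for numerical stability.
    weights = np.exp(epsilon * (scores - scores.max()) / (2 * delta))
    probs = weights / weights.sum()
    idx = np.random.choice(len(opts), p=probs)
    return opts[idx], probs[idx]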
@@ -81,7 +105,7 @@ def main(args):
Parse arguments.
Optional:
reps - The number of repetitions.
iter - The number of iterations.
time - The time limit of the sequence.
'''
def parse_args():
@@ -91,7 +115,7 @@ def parse_args():
# Mandatory arguments.
# Optional arguments.
parser.add_argument('-r', '--reps', help='The number of repetitions.', type=int, default=1)
parser.add_argument('-i', '--iter', help='The number of iterations.', type=int, default=1)
parser.add_argument('-t', '--time', help='The time limit of the sequence.', type=int, default=100)
# Parse arguments.
@@ -70,7 +70,7 @@ def main(args):
# The y axis
plt.ylabel('Mean absolute error (m)') # Set y axis label.
plt.yscale('log')
# plt.ylim(1, 100000000)
plt.ylim(1, 1000000)
# Bar offset
x_offset = -(bar_width/2)*(n - 1)
@@ -101,12 +101,13 @@ def main(args):
rls_data_a, _, _ = lmdk_bgt.adaptive(seq, lmdks, eps_out, .5, .5)
mae_a[i] += lmdk_bgt.mae(seq, rls_data_a)/args.iter
# Event
if lmdk == 0:
# Calculate once
if lmdk == min(data_info[d]['lmdks']):
# Event
rls_data_evt, _ = lmdk_bgt.uniform_r(seq, lmdks, bgt['epsilon'])
mae_evt += lmdk_bgt.mae(seq, rls_data_evt)/args.iter
# User
if lmdk == 100:
elif lmdk == max(data_info[d]['lmdks']):
# User
rls_data_usr, _ = lmdk_bgt.uniform_r(seq, lmdks, bgt['epsilon'])
mae_usr += lmdk_bgt.mae(seq, rls_data_usr)/args.iter
@@ -68,7 +68,7 @@ def main(args):
# The y axis
plt.ylabel('Mean absolute error (m)') # Set y axis label.
plt.yscale('log')
# plt.ylim(1, 100000000)
plt.ylim(1, 1000000)
# Bar offset
x_offset = -(bar_width/2)*(n - 1)
@@ -103,12 +103,13 @@ def main(args):
# mae_d[i] += lmdk_bgt.mae(seq, rls_data_a)/args.iter
# s_d += s_d_c/args.iter
# Event
if lmdk == 0:
# Calculate once
if lmdk == min(data_info[d]['lmdks']):
# Event
rls_data_evt, _ = lmdk_bgt.uniform_r(seq, lmdks, bgt['epsilon'])
mae_evt += lmdk_bgt.mae(seq, rls_data_evt)/args.iter
# User
if lmdk == 100:
elif lmdk == max(data_info[d]['lmdks']):
# User
rls_data_usr, _ = lmdk_bgt.uniform_r(seq, lmdks, bgt['epsilon'])
mae_usr += lmdk_bgt.mae(seq, rls_data_usr)/args.iter
@@ -558,10 +558,10 @@ def skip_cont(seq, lmdks, epsilon):
# Add noise
o = lmdk_lib.randomized_response(is_landmark, bgts[i])
if is_landmark:
bgts[i] = 0
if i > 0:
# Approximate with previous
o = rls_data[i - 1][1]
bgts[i] = 0
rls_data[i] = [is_landmark, o]
return rls_data, bgts
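The hunk above moves the budget zeroing so that a landmark timestamp both reuses the previous release and consumes no budget. A hedged, self-contained sketch of that Skip pattern follows; the helper name, the sequence/landmark representation, and the randomized_response signature are illustrative assumptions, not the repository's exact API.

def skip_release(seq, lmdks, bgts, randomized_response):
    # seq: list of timestamped items; lmdks: set of landmark items;
    # bgts: per-timestamp privacy budgets; randomized_response: a
    # perturbation primitive taking (value, budget) -- all illustrative.
    rls_data = [None] * len(seq)
    for i, item in enumerate(seq):
        is_landmark = item in lmdks
        # Perturb with the budget allocated to this timestamp.
        o = randomized_response(is_landmark, bgts[i])
        if is_landmark:
            if i > 0:
                # Approximate with the previous release instead of publishing anew.
                o = rls_data[i - 1][1]
            # No budget is spent at landmark timestamps.
            bgts[i] = 0
        rls_data[i] = [is_landmark, o]
    return rls_data, bgts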
BIN graphics/evaluation/hue-sel.pdf (new file; binary file not shown)
BIN graphics/evaluation/sel-dist-emd.pdf (new file; binary file not shown)
BIN graphics/evaluation/sel-dist-norm.pdf (new file; binary file not shown)
BIN rslt/lmdk_sel_cmp/lmdk_sel_cmp-emd-l.pdf (new file; binary file not shown)
BIN rslt/lmdk_sel_cmp/lmdk_sel_cmp-norm-l.pdf (new file; binary file not shown)
Other binary files were also modified (not shown).
@@ -43,7 +43,7 @@ We take into account only the temporal order of the points and the position of r
\subsection{Configurations}
\label{subsec:eval-conf}

\subsubsection{{\Thethings}' percentage}
\subsubsection{{\Thething} percentage}

For the Copenhagen data set, we achieve
$0\%$ {\thethings} by considering an empty list of contact devices,
@@ -53,18 +53,22 @@ $60\%$ with $[181$, $182$, $192$, $195$, $196$, $201$, $203$, $207$, $221$, $230
$80\%$ with $[260$, $282$, $287$, $289$, $290$, $291$, $308$, $311$, $318$, $323$, $324$, $330$, $334$, $335$, $344$, $350$, $353$, $355$, $357$, $358$, $361$, $363]$, and
$100\%$ by including all of the possible contacts.

In HUE, we get $0$, $20$ $40$, $60$, $80$, and $100$ {\thethings} percentages by setting the energy consumption threshold below $0.28$, $1.12$, $0.88$, $0.68$, $0.54$, $4.45$kWh respectively.
In HUE, we get $0$\%, $20$\%, $40$\%, $60$\%, $80$\%, and $100$\% {\thethings} by setting the energy consumption threshold below $0.28$kWh, $1.12$kWh, $0.88$kWh, $0.68$kWh, $0.54$kWh, and $4.45$kWh, respectively.

In T-drive, we achieved the desired {\thethings} percentages by utilizing the method of Li et al.~\cite{li2008mining} for detecting stay points in trajectory data.
In T-drive, we achieve the desired {\thething} percentages by utilizing the method of Li et al.~\cite{li2008mining} for detecting stay points in trajectory data.
In more detail, the algorithm checks for each data item if each subsequent item is within a given distance threshold $\Delta l$ and measures the time period $\Delta t$ between the present point and the last subsequent point.
We achieve $0$, $20$ $40$, $60$, $80$, and $100$ {\thethings} percentages by setting the ($\Delta l$ in meters, $\Delta t$ in minutes) pairs input to the stay point discovery method as [($0$, $1000$), ($2095$, $30$), ($2790$, $30$), ($3590$, $30$), ($4825$, $30$), ($10350$, $30$)].
We achieve $0$\%, $20$\%, $40$\%, $60$\%, $80$\%, and $100$\% {\thethings} by setting the ($\Delta l$ in meters, $\Delta t$ in minutes) pairs input to the stay point discovery method as [($0$, $1000$), ($2095$, $30$), ($2790$, $30$), ($3590$, $30$), ($4825$, $30$), ($10350$, $30$)].
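As a rough illustration of the stay-point idea described above (not the exact routine used for T-drive), the sketch below assumes points are (timestamp in minutes, x in meters, y in meters) tuples sorted by time and uses planar distances; Li et al.'s original method works directly on GPS coordinates.

import math

def stay_points(points, d_thresh, t_thresh):
    # points: (t_minutes, x_m, y_m) tuples sorted by time (assumption).
    stays, i, n = [], 0, len(points)
    while i < n:
        j = i + 1
        # Scan forward while subsequent points stay within the distance threshold.
        while j < n and math.dist(points[i][1:], points[j][1:]) <= d_thresh:
            j += 1
        # j - 1 is the last subsequent point still within d_thresh of point i.
        if j - 1 > i and points[j - 1][0] - points[i][0] >= t_thresh:
            stays.append((points[i], points[j - 1]))
            i = j
        else:
            i += 1
    return stays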
We generated synthetic data with \emph{skewed} (the {\thethings} are distributed towards the beginning/end of the series), \emph{symmetric} (in the middle), \emph{bimodal} (both end and beginning), and \emph{uniform} (all over the time series) {\thething} distributions.
In order to get {\thethings} with the above distribution features, we generate probability distributions with appropriate characteristics and sample from them, without replacement, the desired number of points.
%The generated distributions are representative of the cases that we wish to examine during the experiments.
For example, for a left-skewed {\thethings} distribution we would utilize a truncated distribution resulting from the restriction of the domain of a distribution to the beginning and end of the time series with its location shifted to the center of the right half of the series.
For example, for a left-skewed {\thething} distribution we would utilize a truncated distribution resulting from the restriction of the domain of a distribution to the beginning and end of the time series with its location shifted to the center of the right half of the series.
For consistency, we calculate the scale parameter depending on the length of the series by setting it equal to the series' length over a constant.
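A possible sketch of the sampling procedure described above, assuming a truncated normal as the generating distribution; the function name, parameters, and the use of SciPy are illustrative and do not correspond to the repository's lmdk_lib API.

import numpy as np
from scipy import stats

def sample_lmdks(seq_len, n_lmdks, loc, scale_const=10, seed=None):
    # Weight every timestamp by a normal truncated to the series domain,
    # then sample n_lmdks timestamps without replacement.
    rng = np.random.default_rng(seed)
    scale = seq_len / scale_const                      # scale = series length over a constant
    a, b = (0 - loc) / scale, (seq_len - 1 - loc) / scale
    t = np.arange(seq_len)
    w = stats.truncnorm.pdf(t, a, b, loc=loc, scale=scale)
    return np.sort(rng.choice(t, size=n_lmdks, replace=False, p=w / w.sum()))

# e.g., a skewed landmark set centered on the right half of a 100-timestamp series
lmdks = sample_lmdks(100, 20, loc=75)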
Notice that in our experiments, in the cases when we have $0\%$ and $100\%$ of the events being {\thethings}, we get the same behavior as in event- and user-level privacy respectively.
This happens due to the fact that at each timestamp we take into account only the data items at the current timestamp and ignore the rest of the time series (event-level) when there are no {\thethings}.
Whereas, when each timestamp corresponds to a {\thething}, we consider and protect all the events throughout the entire series (user-level).

\subsubsection{Privacy parameters}
@@ -1,6 +1,13 @@
\chapter{Evaluation}
\label{ch:eval}

In this chapter we present the experiments that we performed, to evaluate the methodology that we introduced in chapter~\ref{ch:lmdk-prv}, on real and synthetic data sets.
Section~\ref{sec:eval-dtl} contains all the details regarding the data sets that we utilized for our experiments (Section~\ref{subsec:eval-dat}) along with the parameter configurations.
Section~\ref{sec:eval-lmdk} evaluates the data utility of the {\thething} privacy mechanisms that we designed in Section~\ref{sec:thething} and investigates the behavior of the privacy loss under temporal correlation for different distributions of {\thethings}.
Section~\ref{sec:eval-lmdk-sel} justifies the decisions that we made while designing the privacy-preserving {\thething} selection component in Section~\ref{sec:theotherthing} and assesses the data utility impact of the latter.
Finally, Section~\ref{sec:eval-sum} concludes this chapter by summarizing the main takeaways of the results of the experiments that we performed.

\input{evaluation/details}
\input{evaluation/thething}
\input{evaluation/theotherthing}
\input{evaluation/summary}
text/evaluation/summary.tex (new file, 8 lines)
@@ -0,0 +1,8 @@
\section{Summary}
\label{sec:eval-sum}

In this chapter we presented the experimental evaluation of the {\thething} privacy mechanisms and the privacy-preserving {\thething} selection mechanism that we developed in chapter~\ref{ch:lmdk-prv} on real and synthetic data sets.
The Adaptive mechanism is the most reliable and best performing mechanism, in terms of overall data utility, with minimal tuning across most cases.
Skip performs optimally in data sets with a lower value range where approximation fits best.
The {\thething} selection component introduces a reasonable data utility decline to all of our mechanisms; however, Adaptive handles it well and bounds the data utility to higher levels compared to user-level protection.
In terms of temporal correlation, we observe that under moderate and strong temporal correlation, a greater average regular--{\thething} event distance in a {\thething} distribution causes greater overall privacy loss.
@@ -1,2 +1,61 @@
\section{Selection of events}
\label{sec:lmdk-sel-eval}
\label{sec:eval-lmdk-sel}

In this section, we present the experiments that we performed, to test the methodology that we presented in Section~\ref{subsec:lmdk-sel-sol}, on real and synthetic data sets.
With the experiments on the synthetic data sets (Section~\ref{subsec:sel-utl}), we show the normalized Euclidean and Wasserstein distances of the time series histograms for various distributions and {\thething} percentages.
This allows us to justify the design decisions for the concept that we showcased in Section~\ref{subsec:lmdk-sel-sol}.
With the experiments on the real data sets (Section~\ref{subsec:sel-prv}), we show the performance in terms of utility of our three {\thething} mechanisms in combination with the privacy-preserving {\thething} selection component.

\subsection{{\Thething} selection utility metrics}
\label{subsec:sel-utl}

Figure~\ref{fig:sel-dist} demonstrates the normalized distance that we obtain when we utilize either (a)~the Euclidean or (b)~the Wasserstein distance metric to generate a set of {\thethings} including regular events.

\begin{figure}[htp]
\centering
\subcaptionbox{Euclidean\label{fig:sel-dist-norm}}{%
\includegraphics[width=.5\linewidth]{evaluation/sel-dist-norm}%
}%
\subcaptionbox{Wasserstein\label{fig:sel-dist-emd}}{%
\includegraphics[width=.5\linewidth]{evaluation/sel-dist-emd}%
}%
\caption{The normalized (a)~Euclidean, and (b)~Wasserstein distance of the generated {\thething} sets for different {\thething} percentages.}
\label{fig:sel-dist}
\end{figure}

Comparing the results of the Euclidean distance in Figure~\ref{fig:sel-dist-norm} with those of the Wasserstein in Figure~\ref{fig:sel-dist-emd}, we conclude that the Euclidean distance provides more consistent results for all possible distributions.
% (0 + (0.25 + 0.25 + 0.3 + 0.3)/4 + (0.45 + 0.45 + 0.45 + 0.5)/4 + (0.5 + 0.5 + 0.7 + 0.7)/4 + (0.6 + 0.6 + 1 + 1)/4 + (0.3 + 0.3 + 0.3 + 0.3)/4)/6
% (0 + (0.15 + 0.15 + 0.15 + 0.15)/4 + (0.2 + 0.2 + 0.3 + 0.4)/4 + (0.3 + 0.3 + 0.6 + 0.6)/4 + (0.3 + 0.3 + 1 + 1)/4 + (0.05 + 0.05 + 0.05 + 0.05)/4)
The maximum difference is approximately $0.4$ for the former and $0.7$ for the latter between the bimodal and skewed {\thething} distributions.
While both methods share the same mean normalized distance of $0.4$, the Euclidean distance demonstrates a more consistent performance among all possible {\thething} distributions.
Therefore, we choose to utilize the Euclidean distance metric for the implementation of the privacy-preserving {\thething} selection in Section~\ref{subsec:lmdk-sel-sol}.
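For reference, a minimal sketch of how the two metrics could be computed for a pair of histograms; the normalization constants (such as the 21 for the Euclidean case and 11.75 for the Wasserstein case used in the plotting script earlier in this commit) are assumptions tied to that particular experimental setup and would be applied afterwards.

import numpy as np
from scipy.stats import wasserstein_distance

def hist_distances(h_true, h_sel):
    h_true = np.asarray(h_true, dtype=float)
    h_sel = np.asarray(h_sel, dtype=float)
    # Euclidean (L2) distance between the two histograms.
    eucl = np.linalg.norm(h_true - h_sel)
    # Wasserstein (earth mover's) distance, treating the bin indices as
    # positions weighted by the histogram counts.
    bins = np.arange(len(h_true))
    emd = wasserstein_distance(bins, bins, u_weights=h_true, v_weights=h_sel)
    return eucl, emd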
\subsection{Budget allocation and {\thething} selection}
\label{subsec:sel-prv}

Figure~\ref{fig:real-sel} exhibits the performance of Skip, Uniform, and Adaptive (see Section~\ref{subsec:lmdk-mechs}) in combination with the {\thething} selection component.

\begin{figure}[htp]
\centering
\subcaptionbox{Copenhagen\label{fig:copenhagen-sel}}{%
\includegraphics[width=.5\linewidth]{evaluation/copenhagen-sel}%
}%
\hspace{\fill}
\subcaptionbox{HUE\label{fig:hue-sel}}{%
\includegraphics[width=.5\linewidth]{evaluation/hue-sel}%
}%
\subcaptionbox{T-drive\label{fig:t-drive-sel}}{%
\includegraphics[width=.5\linewidth]{evaluation/t-drive-sel}%
}%
\caption{The mean absolute error (a)~as a percentage, (b)~in kWh, and (c)~in meters of the released data for different {\thething} percentages.}
\label{fig:real-sel}
\end{figure}

In comparison with the utility performance without the {\thething} selection component (Figure~\ref{fig:real}), we notice a slight deterioration for all three models.
This is natural since we allocated part of the available privacy budget to the privacy-preserving {\thething} selection component, which in turn increased the number of {\thethings}.
Therefore, there is less privacy budget available for data publishing throughout the time series for $0$\% and $100$\% {\thethings}.
Skip performs best in our experiments with HUE, due to the low range in the energy consumption and the high scale of the Laplace noise, which it avoids due to its tendency to approximate.
However, for the Copenhagen data set and T-drive, it attains greater mean absolute error than the user-level protection scheme.
Overall, Adaptive has a consistent performance in terms of utility for all of the data sets that we experimented with.
@@ -1,16 +1,11 @@
\section{Significant events}
\label{sec:lmdk-eval}
\label{sec:eval-lmdk}

% \kat{After discussing with Dimitris, I thought you are keeping one chapter for the proposals of the thesis. In this case, it would be more clean to keep the theoretical contributions in one chapter and the evaluation in a separate chapter. }
% \mk{OK.}
In this section we present the experiments that we performed on real and synthetic data sets.
In this section, we present the experiments that we performed, to test the methodology that we presented in Section~\ref{subsec:lmdk-sol}, on real and synthetic data sets.
With the experiments on the real data sets (Section~\ref{subsec:lmdk-expt-bgt}), we show the performance in terms of utility of our three {\thething} mechanisms.
With the experiments on the synthetic data sets (Section~\ref{subsec:lmdk-expt-cor}) we show the privacy loss by our framework when tuning the size and statistical characteristics of the input {\thething} set $L$ with special emphasis on how the privacy loss under temporal correlation is affected by the number and distribution of the {\thethings}.

Notice that in our experiments, in the cases when we have $0\%$ and $100\%$ of the events being {\thethings}, we get the same behavior as in event- and user-level privacy respectively.
This happens due the fact that at each timestamp we take into account only the data items at the current timestamp and ignore the rest of the time series (event-level) when there are no {\thethings}.
Whereas, when each timestamp corresponds to a {\thething} we consider and protect all the events throughout the entire series (user-level).
With the experiments on the synthetic data sets (Section~\ref{subsec:lmdk-expt-cor}) we show the privacy loss by our framework when tuning the size and statistical characteristics of the input {\thething} set $L$ with special emphasis on how the privacy loss under temporal correlation is affected by the number and distribution of the {\thethings}.

\subsection{Budget allocation schemes}
@@ -30,20 +25,21 @@ Figure~\ref{fig:real} exhibits the performance of the three mechanisms: Skip, Un
\subcaptionbox{T-drive\label{fig:t-drive}}{%
\includegraphics[width=.5\linewidth]{evaluation/t-drive}%
}%
\caption{The mean absolute error (a)~as a percentage, (b)~in kWh, and (c)~in meters of the released data for different {\thethings} percentages.}
\caption{The mean absolute error (a)~as a percentage, (b)~in kWh, and (c)~in meters of the released data for different {\thething} percentages.}
\label{fig:real}
\end{figure}

% For the Geolife data set (Figure~\ref{fig:geolife}), Skip has the best performance (measured in Mean Absolute Error, in meters) because it invests the most budget overall at every regular event, by approximating the {\thething} data based on previous releases.
% Due to the data set's high density (every $1$--$5$ seconds or every $5$--$10$ meters per point) approximating constantly has a low impact on the data utility.
% On the contrary, the lower density of the T-drive data set (Figure~\ref{fig:t-drive}) has a negative impact on the performance of Skip.
For the Copenhagen data set (Figure~\ref{fig:copenhagen}), Adaptive has a constant overall performance and performs best for $0$, $60$, and $80$\% {\thethings}.
The Skip model excels, compared to the others, at cases where it needs to approximate a lot ($100$\%).
The combination of the low range in HUE ($[0.28$, $4.45]$ with an average of $0.88$kWh) and the large scale in the Laplace mechanism results in a low mean absolute error for Skip(Figure~\ref{fig:hue}).
For the Copenhagen data set (Figure~\ref{fig:copenhagen}), Adaptive has a constant overall performance and performs best for $0$\%, $60$\%, and $80$\% {\thethings}.
We notice that for $0$\% {\thethings}, it achieves better utility than the event-level protection.
The Skip model excels, compared to the others, at cases where it needs to approximate $20$\%--$40$\% or $100$\% of the time.
The combination of the low range in HUE ($[0.28$, $4.45]$ with an average of $0.88$kWh) and the large scale in the Laplace mechanism results in a low mean absolute error for Skip (Figure~\ref{fig:hue}).
In general, a scheme that favors approximation over noise injection would achieve a better performance in this case.
However, the Adaptive model performs by far better than Uniform and strikes a nice balance between event- and user-level protection for all {\thethings} percentages.
In the T-drive data set (Figure~\ref{fig:t-drive}), the Adaptive mechanism outperforms the Uniform one by $10$\%--$20$\% for all {\thethings} percentages greater than $40$ and by more than $20$\% the Skip one.
The lower density (average distance of $623$ meters) of the T-drive data set has a negative impact on the performance of Skip.
However, the Adaptive model performs by far better than Uniform and strikes a nice balance between event- and user-level protection for all {\thething} percentages.
In the T-drive data set (Figure~\ref{fig:t-drive}), the Adaptive mechanism outperforms Uniform by $10$\%--$20$\% for all {\thething} percentages greater than $40$\% and Skip by more than $20$\%.
The lower density (average distance of $623$m) of the T-drive data set has a negative impact on the performance of Skip.

In general, we can claim that the Adaptive is the most reliable and best performing mechanism with minimal tuning, if we take into consideration the drawbacks of the Skip mechanism mentioned in Section~\ref{subsec:lmdk-mechs}.
Moreover, designing a data-dependent sampling scheme would possibly yield better results for Adaptive.
@@ -54,10 +50,6 @@ Moreover, designing a data-dependent sampling scheme would possibly result in be

Figure~\ref{fig:avg-dist} shows a comparison of the average temporal distance of the events from the previous/next {\thething} or the start/end of the time series for various distributions in synthetic data.
More particularly, we count for every event the total number of events between itself and the nearest {\thething} or the series edge.
We observe that the uniform and bimodal distributions tend to limit the regular event--{\thething} distance.
This is due to the fact that the former scatters the {\thethings}, while the latter distributes them on both edges, leaving a shorter space uninterrupted by {\thethings}.
% and as a result they reduce the uninterrupted space by landmarks in the sequence.
On the contrary, distributing the {\thethings} at one part of the sequence, as in skewed or symmetric, creates a wider space without {\thethings}.
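A small sketch of the counting described above; the exact off-by-one convention ("events between") and the landmark representation are assumptions that may differ from the actual plotting script.

import numpy as np

def avg_event_lmdk_dist(seq_len, lmdks):
    # For every regular event, take the distance (in timestamps) to the
    # nearest landmark or series edge, whichever is closer, then average.
    lmdk_set = {int(l) for l in lmdks}
    dists = [min(min((abs(t - l) for l in lmdk_set), default=seq_len),
                 t, seq_len - 1 - t)
             for t in range(seq_len) if t not in lmdk_set]
    return float(np.mean(dists)) if dists else 0.0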
\begin{figure}[htp]
\centering
@@ -66,14 +58,13 @@ On the contrary, distributing the {\thethings} at one part of the sequence, as i
\label{fig:avg-dist}
\end{figure}

We observe that the uniform and bimodal distributions tend to limit the regular event--{\thething} distance.
This is due to the fact that the former scatters the {\thethings}, while the latter distributes them on both edges, leaving a shorter space uninterrupted by {\thethings}.
% and as a result they reduce the uninterrupted space by landmarks in the sequence.
On the contrary, distributing the {\thethings} at one part of the sequence, as in skewed or symmetric, creates a wider space without {\thethings}.

Figure~\ref{fig:dist-cor} illustrates a comparison among the aforementioned distributions regarding the overall privacy loss under (a)~weak, (b)~moderate, and (c)~strong temporal correlation degrees.
The line shows the overall privacy loss---for all cases of {\thethings} distribution---without temporal correlation.
In combination with Figure~\ref{fig:avg-dist}, we conclude that a greater average event--{\thething} distance in a distribution can result into greater overall privacy loss under moderate and strong temporal correlation.
This is due to the fact that the backward/forward privacy loss accumulates more over time in wider spaces without {\thethings} (see Section~\ref{sec:correlation}).
Furthermore, the behavior of the privacy loss is as expected regarding the temporal correlation degree.
Predictably, a stronger correlation degree generates higher privacy loss while widening the gap between the different distribution cases.
On the contrary, a weaker correlation degree makes it harder to differentiate among the {\thethings} distributions.
The privacy loss under a weak correlation degree converge.

\begin{figure}[htp]
\centering
@@ -91,3 +82,10 @@ The privacy loss under a weak correlation degree converge.
The line shows the overall privacy loss without temporal correlation.}
\label{fig:dist-cor}
\end{figure}

In combination with Figure~\ref{fig:avg-dist}, we conclude that a greater average event--{\thething} distance in a distribution can result in greater overall privacy loss under moderate and strong temporal correlation.
This is due to the fact that the backward/forward privacy loss accumulates more over time in wider spaces without {\thethings} (see Section~\ref{sec:correlation}).
Furthermore, the behavior of the privacy loss is as expected regarding the temporal correlation degree.
Predictably, a stronger correlation degree generates higher privacy loss while widening the gap between the different distribution cases.
On the contrary, a weaker correlation degree makes it harder to differentiate among the {\thethings} distributions.
The privacy loss under a weak correlation degree converges.
@@ -1,4 +1,4 @@
\chapter{Landmark Privacy}
\chapter{{\Thething} privacy}
\label{ch:lmdk-prv}

% Crowdsensing applications