lmdk-expt: Reviewed all graphs for synthetic

2021-10-09 13:27:16 +02:00
parent edb98f736d
commit b03b510f02
12 changed files with 1839 additions and 10 deletions
--- a/code/expt/avg_dist.py
+++ b/code/expt/avg_dist.py
@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+
+import sys
+sys.path.insert(1, '../lib')
+import argparse
+import gdp
+import lmdk_lib
+import math
+from matplotlib import pyplot as plt
+import numpy as np
+import os
+import time
+
+
+def main(args):
+  # Plot the normalized average distance between every timestamp and its
+  # surrounding landmarks, drawing one line per landmark distribution type.
+  #
+  # Args:
+  #   args - Parsed command-line arguments; reads args.time (the time limit
+  #          of the sequence, also the largest landmark count tested) and
+  #          args.reps (repetitions averaged for every data point).
+  #
+  # Side effects: prints progress to stdout and saves the figure to
+  # '../../rslt/avg_dist/avg-dist.pdf' through lmdk_lib.save_plot.
+  #
+  # Number of timestamps
+  # (presumably the timestamps 1..args.time - confirm against lmdk_lib.get_seq)
+  seq = lmdk_lib.get_seq(1, args.time)
+  # Distribution type
+  # (indices 0-3; see the markers list below for the matching names)
+  dist_type = np.array(range(0, 4))
+  # Number of landmarks
+  # (from 0 up to args.time in steps of a fifth of args.time)
+  # NOTE(review): int(args.time/5) is 0 for args.time < 5 and range() raises
+  # ValueError on a zero step.
+  lmdk_n = np.array(range(0, args.time + 1, int(args.time/5)))
+
+  # One marker per distribution type, indexed by d_i in the loop below.
+  markers = [
+    '^', # Symmetric
+    'v', # Skewed
+    'D', # Bimodal
+    's'  # Uniform
+  ]
+
+  # Initialize plot
+  lmdk_lib.plot_init()
+  # The x axis
+  # (one tick per landmark count, labelled as an integer percentage of
+  # len(seq))
+  x_i = np.arange(len(lmdk_n))
+  plt.xticks(x_i, ((lmdk_n/len(seq))*100).astype(int))
+  plt.xlabel('Landmarks (%)')  # Set x axis label.
+  plt.xlim(x_i.min(), x_i.max())
+  # The y axis
+  plt.ylabel('Normalized average distance')  # Set y axis label.
+  plt.yscale('log')
+  plt.ylim(.001, 1)
+  # Logging
+  print('Average distance', end='', flush=True)
+  for d_i, d in enumerate(dist_type):
+    # Accumulator: one averaged distance per landmark count.
+    avg_dist = np.zeros(len(lmdk_n))
+    # Logging
+    print('.', end='', flush=True)
+    for i, n in enumerate(lmdk_n):
+      for r in range(args.reps):
+        # Generate landmarks
+        lmdks = lmdk_lib.get_lmdks(seq, n, d)
+        # Calculate average distance
+        avg_cur = 0
+        for t in seq:
+          # presumably t_prv/t_nxt are the nearest previous/next landmark or
+          # the sequence edge - confirm against gdp.get_limits
+          t_prv, t_nxt = gdp.get_limits(t, seq, lmdks)
+          # Sum of (|t - t_prv| - 1) and (|t - t_nxt| - 1), averaged over all
+          # timestamps of the sequence.
+          avg_cur += (abs(t - t_prv) - 1 + abs(t - t_nxt) - 1 )/len(seq)
+        # Normalized average based on repetitions
+        avg_dist[i] += avg_cur/args.reps
+    # Rescaling (min-max normalization)
+    # https://en.wikipedia.org/wiki/Feature_scaling#Rescaling_(min-max_normalization)
+    # NOTE(review): when all entries are equal the denominator is 0 and the
+    # result is NaN.
+    avg_dist = (avg_dist - avg_dist.min())/(avg_dist.max() - avg_dist.min())
+    # Normalize for log scale
+    # (a log axis cannot show 0, so an exact 0 in the last entry is replaced
+    # by the lower y limit; only the last entry is checked)
+    if avg_dist[len(avg_dist) - 1] == 0:
+      avg_dist[len(avg_dist) - 1] = .001
+    # Set label
+    # (the library name is overridden with 'Skewed' for distribution index 1)
+    label = lmdk_lib.dist_type_to_str(d_i)
+    if d_i == 1:
+      label = 'Skewed'
+    # Plot line
+    plt.plot(
+      x_i,
+      avg_dist,
+      label=label,
+      marker=markers[d_i],
+      markersize=lmdk_lib.marker_size,
+      markeredgewidth=0,
+      linewidth=lmdk_lib.line_width
+    )
+  # Plot legend
+  lmdk_lib.plot_legend()
+  # Show plot
+  # plt.show()
+  # Save plot
+  lmdk_lib.save_plot(str('../../rslt/avg_dist/' + 'avg-dist' + '.pdf'))
+  print(' [OK]', flush=True)
+
+
+'''
+  Parse arguments.
+
+  Optional:
+    reps - The number of repetitions.
+    time - The time limit of the sequence.
+'''
+def parse_args():
+  # Build the command-line parser for this script and parse the arguments
+  # given on the command line.
+  #
+  # Returns:
+  #   The namespace produced by argparse, holding the integers `reps`
+  #   (default 1) and `time` (default 100).
+  #
+  # Create argument parser.
+  parser = argparse.ArgumentParser()
+
+  # Mandatory arguments.
+  # (none are defined for this script)
+
+  # Optional arguments.
+  parser.add_argument('-r', '--reps', help='The number of repetitions.', type=int, default=1)
+  parser.add_argument('-t', '--time', help='The time limit of the sequence.', type=int, default=100)
+
+  # Parse arguments.
+  args = parser.parse_args()
+
+  return args
+
+
+if __name__ == '__main__':
+  # Script entry point: parse the arguments, run the experiment, and report
+  # the elapsed wall-clock time.
+  try:
+    args = parse_args()
+    start_time = time.time()
+    main(args)
+    end_time = time.time()
+    print('##############################')
+    # The duration is formatted as a time of day (HH:MM:SS), so a run of
+    # 24 hours or more wraps around.
+    print('Time elapsed: %s' % (time.strftime('%H:%M:%S', time.gmtime(end_time - start_time))))
+    print('##############################')
+  except KeyboardInterrupt:
+    # Ctrl-C: print a short message and stop.
+    print('Interrupted by user.')
+    exit()
--- a/code/expt/dist_cor.py
+++ b/code/expt/dist_cor.py
@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+
+import sys
+sys.path.insert(1, '../lib')
+import argparse
+import gdp
+import itertools
+import lmdk_bgt
+import lmdk_lib
+import numpy as np
+import os
+from matplotlib import pyplot as plt
+import time
+
+
+def main(args):
+  # Plot, for each temporal correlation degree, the privacy loss of every
+  # landmark distribution type as grouped bars over the landmark percentage,
+  # together with a line for the privacy loss without correlation.
+  #
+  # Args:
+  #   args - Parsed command-line arguments; reads args.time (the time limit
+  #          of the sequence, also the largest landmark count tested) and
+  #          args.reps (repetitions averaged for every data point).
+  #
+  # Side effects: prints progress to stdout and saves one figure per
+  # correlation degree under '../../rslt/dist_cor/' through
+  # lmdk_lib.save_plot.
+  #
+  # Privacy goal
+  epsilon = 1.0
+  # Number of timestamps
+  # (presumably the timestamps 1..args.time - confirm against lmdk_lib.get_seq)
+  seq = lmdk_lib.get_seq(1, args.time)
+  # Correlation degree (higher values mean weaker correlations)
+  cor_deg = np.array([.01, .1, 1.0])
+  cor_lbl = ['Strong correlation', 'Moderate correlation', 'Weak correlation']
+  # Distribution type
+  dist_type = np.array(range(0, 4))
+  # Number of landmarks
+  # (from 0 up to args.time in steps of a fifth of args.time)
+  # NOTE(review): int(args.time/5) is 0 for args.time < 5 and range() raises
+  # ValueError on a zero step.
+  lmdk_n = np.array(range(0, args.time + 1, int(args.time/5)))
+  # Width of bars
+  # (ticks are 1 apart, so a group of len(dist_type) bars leaves a gap of one
+  # bar width before the next group)
+  bar_width = 1/(len(dist_type) + 1)
+  # For each correlation degree
+  for c_i, c in enumerate(cor_deg):
+    # Logging
+    title = cor_lbl[c_i]
+    print('(%d/%d) %s' %(c_i + 1, len(cor_deg), title), end='', flush=True)
+    # The transition matrix
+    # (presumably the first argument is the number of states - confirm
+    # against gdp.gen_trans_mt)
+    p = gdp.gen_trans_mt(2, c)
+    # Bar offset
+    # (starts at the leftmost bar so that the whole group is centered on its
+    # tick)
+    x_offset = -(bar_width/2)*(len(dist_type) - 1)
+    # Initialize plot
+    lmdk_lib.plot_init()
+    # The x axis
+    # (one tick per landmark count, labelled as an integer percentage of
+    # len(seq))
+    x_i = np.arange(len(lmdk_n))
+    plt.xticks(x_i, ((lmdk_n/len(seq))*100).astype(int))
+    plt.xlabel('Landmarks (%)')  # Set x axis label.
+    # Half a group of bars plus one extra bar width on each side.
+    x_margin = bar_width*(len(dist_type)/2 + 1)
+    plt.xlim(x_i.min() - x_margin, x_i.max() + x_margin)
+    # The y axis
+    plt.ylabel('Privacy loss')  # Set y axis label.
+    plt.yscale('log')
+    plt.ylim(epsilon/10, 100*len(seq))
+    # plt.ylim(0, 10000)
+    for d_i, d in enumerate(dist_type):
+      print('.', end='', flush=True)
+      # Initialization
+      # (e: summed allocated budget, a: summed third output of
+      # gdp.tpl_lmdk_mem; one entry per landmark count)
+      e = np.zeros(len(lmdk_n))
+      a = np.zeros(len(lmdk_n))
+      for i, n in enumerate(lmdk_n):
+        for r in range(args.reps):
+          # Generate landmarks
+          lmdks = lmdk_lib.get_lmdks(seq, n, d)
+          # Uniform budget allocation
+          e_cur = lmdk_bgt.uniform(seq, lmdks, epsilon)
+          # presumably p is passed as both the backward and the forward
+          # transition matrix - confirm against gdp.tpl_lmdk_mem
+          _, _, a_cur = gdp.tpl_lmdk_mem(e_cur, p, p, seq, lmdks)
+          # Save privacy loss
+          # (summed over the sequence and averaged over the repetitions)
+          e[i] += np.sum(e_cur)/args.reps
+          a[i] += np.sum(a_cur)/args.reps
+      # Set label
+      # (the library name is overridden with 'Skewed' for distribution index 1)
+      label = lmdk_lib.dist_type_to_str(d_i)
+      if d_i == 1:
+        label = 'Skewed'
+      # Plot bar for current distribution
+      plt.bar(
+        x_i + x_offset,
+        a,
+        bar_width,
+        label=label,
+        linewidth=lmdk_lib.line_width
+      )
+      # Change offset for next bar
+      x_offset += bar_width
+    # Plot line for no correlation
+    # NOTE(review): `e` holds the values of the last distribution type only;
+    # this assumes the summed budget is the same for every distribution.
+    plt.plot(
+      x_i,
+      e,
+      linewidth=lmdk_lib.line_width,
+      color='#e0e0e0',
+    )
+    # Plot legend
+    lmdk_lib.plot_legend()
+    # Show plot
+    # plt.show()
+    # Save plot
+    # NOTE(review): the file name is the human-readable title (e.g.
+    # 'Strong correlation.pdf'), not a short slug such as 'dist-cor-stg.pdf'
+    # - confirm this is the intended name.
+    lmdk_lib.save_plot(str('../../rslt/dist_cor/' + title + '.pdf'))
+    print(' [OK]', flush=True)
+
+
+'''
+  Parse arguments.
+
+  Optional:
+    reps - The number of repetitions.
+    time - The time limit of the sequence.
+'''
+def parse_args():
+  # Build the command-line parser for this script and parse the arguments
+  # given on the command line.
+  #
+  # Returns:
+  #   The namespace produced by argparse, holding the integers `reps`
+  #   (default 1) and `time` (default 100).
+  #
+  # Create argument parser.
+  parser = argparse.ArgumentParser()
+
+  # Mandatory arguments.
+  # (none are defined for this script)
+
+  # Optional arguments.
+  parser.add_argument('-r', '--reps', help='The number of repetitions.', type=int, default=1)
+  parser.add_argument('-t', '--time', help='The time limit of the sequence.', type=int, default=100)
+
+  # Parse arguments.
+  args = parser.parse_args()
+
+  return args
+
+
+if __name__ == '__main__':
+  # Script entry point: parse the arguments, run the experiment, and report
+  # the elapsed wall-clock time.
+  try:
+    args = parse_args()
+    start_time = time.time()
+    main(args)
+    end_time = time.time()
+    print('##############################')
+    # The duration is formatted as a time of day (HH:MM:SS), so a run of
+    # 24 hours or more wraps around.
+    print('Time elapsed: %s' % (time.strftime('%H:%M:%S', time.gmtime(end_time - start_time))))
+    print('##############################')
+  except KeyboardInterrupt:
+    # Ctrl-C: print a short message and stop.
+    print('Interrupted by user.')
+    exit()
--- a/code/lib/gdp.py
+++ b/code/lib/gdp.py
--- a/graphics/evaluation/avg-dist.pdf
+++ b/graphics/evaluation/avg-dist.pdf
--- a/graphics/evaluation/dist-cor-mod.pdf
+++ b/graphics/evaluation/dist-cor-mod.pdf
--- a/graphics/evaluation/dist-cor-stg.pdf
+++ b/graphics/evaluation/dist-cor-stg.pdf
--- a/graphics/evaluation/dist-cor-wk.pdf
+++ b/graphics/evaluation/dist-cor-wk.pdf
--- a/rslt/avg_dist/avg-dist.pdf
+++ b/rslt/avg_dist/avg-dist.pdf
--- a/rslt/dist_cor/dist-cor-mod.pdf
+++ b/rslt/dist_cor/dist-cor-mod.pdf
--- a/rslt/dist_cor/dist-cor-stg.pdf
+++ b/rslt/dist_cor/dist-cor-stg.pdf
--- a/rslt/dist_cor/dist-cor-wk.pdf
+++ b/rslt/dist_cor/dist-cor-wk.pdf
--- a/text/evaluation/thething.tex
+++ b/text/evaluation/thething.tex
@ -51,7 +51,8 @@ In general, we can claim that the Adaptive is the most reliable and best perform
 Moreover, designing a data-dependent sampling scheme would possibly result in better results for Adaptive.


-\paragraph{Temporal distance and correlation}
+\subsubsection{Temporal distance and correlation}
+
 Figure~\ref{fig:avg-dist} shows a comparison of the average temporal distance of the events from the previous/next {\thething} or the start/end of the time series for various distributions in synthetic data.
 More particularly, we count for every event the total number of events between itself and the nearest {\thething} or the series edge.
 We observe that the uniform and bimodal distributions tend to limit the regular event--{\thething} distance.
@ -61,33 +62,33 @@ On the contrary, distributing the {\thethings} at one part of the sequence, as i

 \begin{figure}[htp]
  \centering
-  \includegraphics[width=.5\linewidth]{avg-dist}%
+  \includegraphics[width=.5\linewidth]{evaluation/avg-dist}%
  \caption{Average temporal distance of the events from the {\thethings} for different {\thethings} percentages within a time series in various {\thethings} distributions.}
  \label{fig:avg-dist}
 \end{figure}

-Figure~\ref{fig:dist-cor} illustrates a comparison among the aforementioned distributions regarding the overall privacy loss under moderate (Figure~\ref{fig:dist-cor-mod}), and strong (Figure~\ref{fig:dist-cor-stg}) correlation degrees.
+Figure~\ref{fig:dist-cor} illustrates a comparison among the aforementioned distributions regarding the overall privacy loss under (a)~weak, (b)~moderate, and (c)~strong temporal correlation degrees.
 The line shows the overall privacy loss---for all cases of {\thethings} distribution---without temporal correlation.
-We skip the presentation of the results under a weak correlation degree, since they converge in this case.
-In combination with Figure~\ref{fig:avg-dist}, we conclude that a greater average event-{\thething} distance  in a distribution can result into greater overall privacy loss under moderate and strong temporal correlation.
-This is due to the fact that the backward/forward privacy loss accumulates more over time in wider spaces without {\thethings} (see Section~\ref{subsec:correlations}).
+In combination with Figure~\ref{fig:avg-dist}, we conclude that a greater average event--{\thething} distance in a distribution can result in greater overall privacy loss under moderate and strong temporal correlation.
+This is due to the fact that the backward/forward privacy loss accumulates more over time in wider spaces without {\thethings} (see Section~\ref{sec:correlation}).
 Furthermore, the behavior of the privacy loss is as expected regarding the temporal correlation degree.
 Predictably, a stronger correlation degree generates higher privacy loss while widening the gap between the different distribution cases.
 On the contrary, a weaker correlation degree makes it harder to differentiate among the {\thethings} distributions.
+The privacy loss under a weak correlation degree converges.

 \begin{figure}[htp]
  \centering
  \subcaptionbox{Weak correlation\label{fig:dist-cor-wk}}{%
-    \includegraphics[width=.5\linewidth]{dist-cor-wk}%
+    \includegraphics[width=.5\linewidth]{evaluation/dist-cor-wk}%
  }%
  \hspace{\fill}
  \subcaptionbox{Moderate correlation\label{fig:dist-cor-mod}}{%
-    \includegraphics[width=.5\linewidth]{dist-cor-mod}%
+    \includegraphics[width=.5\linewidth]{evaluation/dist-cor-mod}%
  }%
  \subcaptionbox{Strong correlation\label{fig:dist-cor-stg}}{%
-    \includegraphics[width=.5\linewidth]{dist-cor-stg}%
+    \includegraphics[width=.5\linewidth]{evaluation/dist-cor-stg}%
  }%
-  \caption{Privacy loss for different {\thethings} percentages and distributions, under weak, moderate, and strong degrees of temporal correlation.
+  \caption{Privacy loss for different {\thethings} percentages and distributions, under (a)~weak, (b)~moderate, and (c)~strong degrees of temporal correlation.
  The line shows the overall privacy loss without temporal correlation.}
  \label{fig:dist-cor}
 \end{figure}