lmdk-expt: Reviewed all graphs for synthetic
This commit is contained in:
parent
edb98f736d
commit
b03b510f02
121
code/expt/avg_dist.py
Normal file
121
code/expt/avg_dist.py
Normal file
@ -0,0 +1,121 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import sys
|
||||||
|
sys.path.insert(1, '../lib')
|
||||||
|
import argparse
|
||||||
|
import gdp
|
||||||
|
import lmdk_lib
|
||||||
|
import math
|
||||||
|
from matplotlib import pyplot as plt
|
||||||
|
import numpy as np
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
|
def main(args):
    """Plot the normalized average temporal distance of regular events
    from the nearest landmark (or the sequence edge) for four landmark
    distributions and several landmark percentages, and save the figure.

    Args:
        args: Parsed CLI arguments; uses args.time (the time limit of
            the sequence) and args.reps (repetitions per configuration).
    """
    # Number of timestamps
    seq = lmdk_lib.get_seq(1, args.time)
    # Distribution type (indices 0-3; see markers below)
    dist_type = np.array(range(0, 4))
    # Number of landmarks: 0%, 20%, ..., 100% of the sequence.
    # Guard the step: for args.time < 5 the original int(args.time/5)
    # is 0 and range() raises ValueError on a zero step.
    step = max(1, args.time//5)
    lmdk_n = np.array(range(0, args.time + 1, step))

    markers = [
        '^',  # Symmetric
        'v',  # Skewed
        'D',  # Bimodal
        's'   # Uniform
    ]

    # Initialize plot
    lmdk_lib.plot_init()
    # The x axis: landmark percentage of the sequence
    x_i = np.arange(len(lmdk_n))
    plt.xticks(x_i, ((lmdk_n/len(seq))*100).astype(int))
    plt.xlabel('Landmarks (%)')  # Set x axis label.
    plt.xlim(x_i.min(), x_i.max())
    # The y axis
    plt.ylabel('Normalized average distance')  # Set y axis label.
    plt.yscale('log')
    plt.ylim(.001, 1)
    # Logging
    print('Average distance', end='', flush=True)
    for d_i, d in enumerate(dist_type):
        avg_dist = np.zeros(len(lmdk_n))
        # Logging
        print('.', end='', flush=True)
        for i, n in enumerate(lmdk_n):
            for _ in range(args.reps):
                # Generate landmarks
                lmdks = lmdk_lib.get_lmdks(seq, n, d)
                # Calculate average distance over all events
                avg_cur = 0
                for t in seq:
                    t_prv, t_nxt = gdp.get_limits(t, seq, lmdks)
                    avg_cur += (abs(t - t_prv) - 1 + abs(t - t_nxt) - 1)/len(seq)
                # Normalized average based on repetitions
                avg_dist[i] += avg_cur/args.reps
        # Rescaling (min-max normalization)
        # https://en.wikipedia.org/wiki/Feature_scaling#Rescaling_(min-max_normalization)
        # Guard against a constant array: the original expression divided
        # by zero and filled avg_dist with NaNs when max == min.
        dist_range = avg_dist.max() - avg_dist.min()
        if dist_range > 0:
            avg_dist = (avg_dist - avg_dist.min())/dist_range
        else:
            avg_dist = np.zeros(len(lmdk_n))
        # Normalize for log scale: a zero last point would vanish on the
        # log-scaled y axis, so clamp it to the axis minimum.
        if avg_dist[-1] == 0:
            avg_dist[-1] = .001
        # Set label
        label = lmdk_lib.dist_type_to_str(d_i)
        if d_i == 1:
            label = 'Skewed'
        # Plot line
        plt.plot(
            x_i,
            avg_dist,
            label=label,
            marker=markers[d_i],
            markersize=lmdk_lib.marker_size,
            markeredgewidth=0,
            linewidth=lmdk_lib.line_width
        )
    # Plot legend
    lmdk_lib.plot_legend()
    # Show plot
    # plt.show()
    # Save plot
    lmdk_lib.save_plot('../../rslt/avg_dist/' + 'avg-dist' + '.pdf')
    print(' [OK]', flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
    """Build the command-line parser and parse sys.argv.

    Optional:
        reps - The number of repetitions.
        time - The time limit of the sequence.

    Returns:
        argparse.Namespace holding the parsed arguments.
    """
    # Create argument parser (no mandatory arguments).
    arg_parser = argparse.ArgumentParser()
    # Optional arguments.
    arg_parser.add_argument('-r', '--reps', help='The number of repetitions.', type=int, default=1)
    arg_parser.add_argument('-t', '--time', help='The time limit of the sequence.', type=int, default=100)
    # Parse and return the arguments.
    return arg_parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    try:
        args = parse_args()
        start_time = time.time()
        main(args)
        end_time = time.time()
        # Report the wall-clock duration of the experiment.
        print('##############################')
        print('Time elapsed: %s' % (time.strftime('%H:%M:%S', time.gmtime(end_time - start_time))))
        print('##############################')
    except KeyboardInterrupt:
        print('Interrupted by user.')
        # Use sys.exit() instead of the site-provided exit() builtin,
        # which is intended for interactive sessions and is not
        # guaranteed to exist in all runtime configurations.
        sys.exit()
|
131
code/expt/dist_cor.py
Normal file
131
code/expt/dist_cor.py
Normal file
@ -0,0 +1,131 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import sys
|
||||||
|
sys.path.insert(1, '../lib')
|
||||||
|
import argparse
|
||||||
|
import gdp
|
||||||
|
import itertools
|
||||||
|
import lmdk_bgt
|
||||||
|
import lmdk_lib
|
||||||
|
import numpy as np
|
||||||
|
import os
|
||||||
|
from matplotlib import pyplot as plt
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
|
def main(args):
    """Plot, for each temporal-correlation degree, the overall privacy
    loss of the uniform budget allocation for four landmark distributions
    and several landmark percentages, and save one figure per degree.

    Args:
        args: Parsed CLI arguments; uses args.time (the time limit of
            the sequence) and args.reps (repetitions per configuration).
    """
    # Privacy goal
    epsilon = 1.0
    # Number of timestamps
    seq = lmdk_lib.get_seq(1, args.time)
    # Correlation degree (higher values means weaker correlations)
    cor_deg = np.array([.01, .1, 1.0])
    cor_lbl = ['Strong correlation', 'Moderate correlation', 'Weak correlation']
    # Distribution type
    dist_type = np.array(range(0, 4))
    # Number of landmarks: 0%, 20%, ..., 100% of the sequence.
    # Guard the step: for args.time < 5 the original int(args.time/5)
    # is 0 and range() raises ValueError on a zero step.
    step = max(1, args.time//5)
    lmdk_n = np.array(range(0, args.time + 1, step))
    # Width of bars
    bar_width = 1/(len(dist_type) + 1)
    # For each correlation degree
    for c_i, c in enumerate(cor_deg):
        # Logging
        title = cor_lbl[c_i]
        print('(%d/%d) %s' %(c_i + 1, len(cor_deg), title), end='', flush=True)
        # The transition matrix
        p = gdp.gen_trans_mt(2, c)
        # Bar offset
        x_offset = -(bar_width/2)*(len(dist_type) - 1)
        # Initialize plot
        lmdk_lib.plot_init()
        # The x axis: landmark percentage of the sequence
        x_i = np.arange(len(lmdk_n))
        plt.xticks(x_i, ((lmdk_n/len(seq))*100).astype(int))
        plt.xlabel('Landmarks (%)')  # Set x axis label.
        x_margin = bar_width*(len(dist_type)/2 + 1)
        plt.xlim(x_i.min() - x_margin, x_i.max() + x_margin)
        # The y axis
        plt.ylabel('Privacy loss')  # Set y axis label.
        plt.yscale('log')
        plt.ylim(epsilon/10, 100*len(seq))
        # plt.ylim(0, 10000)
        for d_i, d in enumerate(dist_type):
            print('.', end='', flush=True)
            # Initialization: per-percentage budget sum and privacy loss
            e = np.zeros(len(lmdk_n))
            a = np.zeros(len(lmdk_n))
            for i, n in enumerate(lmdk_n):
                for _ in range(args.reps):
                    # Generate landmarks
                    lmdks = lmdk_lib.get_lmdks(seq, n, d)
                    # Uniform budget allocation
                    e_cur = lmdk_bgt.uniform(seq, lmdks, epsilon)
                    _, _, a_cur = gdp.tpl_lmdk_mem(e_cur, p, p, seq, lmdks)
                    # Save privacy loss (averaged over repetitions)
                    e[i] += np.sum(e_cur)/args.reps
                    a[i] += np.sum(a_cur)/args.reps
            # Set label
            label = lmdk_lib.dist_type_to_str(d_i)
            if d_i == 1:
                label = 'Skewed'
            # Plot bar for current distribution
            plt.bar(
                x_i + x_offset,
                a,
                bar_width,
                label=label,
                linewidth=lmdk_lib.line_width
            )
            # Change offset for next bar
            x_offset += bar_width
        # Plot line for no correlation.
        # NOTE(review): e holds the values of the LAST distribution
        # iteration only; presumably the budget sums coincide across
        # distributions — confirm against lmdk_bgt.uniform.
        plt.plot(
            x_i,
            e,
            linewidth=lmdk_lib.line_width,
            color='#e0e0e0',
        )
        # Plot legend
        lmdk_lib.plot_legend()
        # Show plot
        # plt.show()
        # Save plot
        lmdk_lib.save_plot('../../rslt/dist_cor/' + title + '.pdf')
        print(' [OK]', flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
    """Build the command-line parser and parse sys.argv.

    Optional:
        reps - The number of repetitions.
        time - The time limit of the sequence.

    Returns:
        argparse.Namespace holding the parsed arguments.
    """
    # Create argument parser (no mandatory arguments).
    arg_parser = argparse.ArgumentParser()
    # Optional arguments.
    arg_parser.add_argument('-r', '--reps', help='The number of repetitions.', type=int, default=1)
    arg_parser.add_argument('-t', '--time', help='The time limit of the sequence.', type=int, default=100)
    # Parse and return the arguments.
    return arg_parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    try:
        args = parse_args()
        start_time = time.time()
        main(args)
        end_time = time.time()
        # Report the wall-clock duration of the experiment.
        print('##############################')
        print('Time elapsed: %s' % (time.strftime('%H:%M:%S', time.gmtime(end_time - start_time))))
        print('##############################')
    except KeyboardInterrupt:
        print('Interrupted by user.')
        # Use sys.exit() instead of the site-provided exit() builtin,
        # which is intended for interactive sessions and is not
        # guaranteed to exist in all runtime configurations.
        sys.exit()
|
1576
code/lib/gdp.py
Normal file
1576
code/lib/gdp.py
Normal file
File diff suppressed because it is too large
Load Diff
Binary file not shown.
BIN
graphics/evaluation/dist-cor-mod.pdf
Normal file
BIN
graphics/evaluation/dist-cor-mod.pdf
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
rslt/avg_dist/avg-dist.pdf
Normal file
BIN
rslt/avg_dist/avg-dist.pdf
Normal file
Binary file not shown.
BIN
rslt/dist_cor/dist-cor-mod.pdf
Normal file
BIN
rslt/dist_cor/dist-cor-mod.pdf
Normal file
Binary file not shown.
Binary file not shown.
BIN
rslt/dist_cor/dist-cor-wk.pdf
Normal file
BIN
rslt/dist_cor/dist-cor-wk.pdf
Normal file
Binary file not shown.
@ -51,7 +51,8 @@ In general, we can claim that the Adaptive is the most reliable and best perform
|
|||||||
Moreover, designing a data-dependent sampling scheme would possibly result in better results for Adaptive.
|
Moreover, designing a data-dependent sampling scheme would possibly result in better results for Adaptive.
|
||||||
|
|
||||||
|
|
||||||
\paragraph{Temporal distance and correlation}
|
\subsubsection{Temporal distance and correlation}
|
||||||
|
|
||||||
Figure~\ref{fig:avg-dist} shows a comparison of the average temporal distance of the events from the previous/next {\thething} or the start/end of the time series for various distributions in synthetic data.
|
Figure~\ref{fig:avg-dist} shows a comparison of the average temporal distance of the events from the previous/next {\thething} or the start/end of the time series for various distributions in synthetic data.
|
||||||
More particularly, we count for every event the total number of events between itself and the nearest {\thething} or the series edge.
|
More particularly, we count for every event the total number of events between itself and the nearest {\thething} or the series edge.
|
||||||
We observe that the uniform and bimodal distributions tend to limit the regular event--{\thething} distance.
|
We observe that the uniform and bimodal distributions tend to limit the regular event--{\thething} distance.
|
||||||
@ -61,33 +62,33 @@ On the contrary, distributing the {\thethings} at one part of the sequence, as i
|
|||||||
|
|
||||||
\begin{figure}[htp]
|
\begin{figure}[htp]
|
||||||
\centering
|
\centering
|
||||||
\includegraphics[width=.5\linewidth]{avg-dist}%
|
\includegraphics[width=.5\linewidth]{evaluation/avg-dist}%
|
||||||
\caption{Average temporal distance of the events from the {\thethings} for different {\thethings} percentages within a time series in various {\thethings} distributions.}
|
\caption{Average temporal distance of the events from the {\thethings} for different {\thethings} percentages within a time series in various {\thethings} distributions.}
|
||||||
\label{fig:avg-dist}
|
\label{fig:avg-dist}
|
||||||
\end{figure}
|
\end{figure}
|
||||||
|
|
||||||
Figure~\ref{fig:dist-cor} illustrates a comparison among the aforementioned distributions regarding the overall privacy loss under moderate (Figure~\ref{fig:dist-cor-mod}), and strong (Figure~\ref{fig:dist-cor-stg}) correlation degrees.
|
Figure~\ref{fig:dist-cor} illustrates a comparison among the aforementioned distributions regarding the overall privacy loss under (a)~weak, (b)~moderate, and (c)~strong temporal correlation degrees.
|
||||||
The line shows the overall privacy loss---for all cases of {\thethings} distribution---without temporal correlation.
|
The line shows the overall privacy loss---for all cases of {\thethings} distribution---without temporal correlation.
|
||||||
We skip the presentation of the results under a weak correlation degree, since they converge in this case.
|
In combination with Figure~\ref{fig:avg-dist}, we conclude that a greater average event--{\thething} distance in a distribution can result into greater overall privacy loss under moderate and strong temporal correlation.
|
||||||
In combination with Figure~\ref{fig:avg-dist}, we conclude that a greater average event--{\thething} distance in a distribution can result into greater overall privacy loss under moderate and strong temporal correlation.
|
This is due to the fact that the backward/forward privacy loss accumulates more over time in wider spaces without {\thethings} (see Section~\ref{sec:correlation}).
|
||||||
This is due to the fact that the backward/forward privacy loss accumulates more over time in wider spaces without {\thethings} (see Section~\ref{subsec:correlations}).
|
|
||||||
Furthermore, the behavior of the privacy loss is as expected regarding the temporal correlation degree.
|
Furthermore, the behavior of the privacy loss is as expected regarding the temporal correlation degree.
|
||||||
Predictably, a stronger correlation degree generates higher privacy loss while widening the gap between the different distribution cases.
|
Predictably, a stronger correlation degree generates higher privacy loss while widening the gap between the different distribution cases.
|
||||||
On the contrary, a weaker correlation degree makes it harder to differentiate among the {\thethings} distributions.
|
On the contrary, a weaker correlation degree makes it harder to differentiate among the {\thethings} distributions.
|
||||||
|
The privacy loss under a weak correlation degree converges.
|
||||||
|
|
||||||
\begin{figure}[htp]
|
\begin{figure}[htp]
|
||||||
\centering
|
\centering
|
||||||
\subcaptionbox{Weak correlation\label{fig:dist-cor-wk}}{%
|
\subcaptionbox{Weak correlation\label{fig:dist-cor-wk}}{%
|
||||||
\includegraphics[width=.5\linewidth]{dist-cor-wk}%
|
\includegraphics[width=.5\linewidth]{evaluation/dist-cor-wk}%
|
||||||
}%
|
}%
|
||||||
\hspace{\fill}
|
\hspace{\fill}
|
||||||
\subcaptionbox{Moderate correlation\label{fig:dist-cor-mod}}{%
|
\subcaptionbox{Moderate correlation\label{fig:dist-cor-mod}}{%
|
||||||
\includegraphics[width=.5\linewidth]{dist-cor-mod}%
|
\includegraphics[width=.5\linewidth]{evaluation/dist-cor-mod}%
|
||||||
}%
|
}%
|
||||||
\subcaptionbox{Strong correlation\label{fig:dist-cor-stg}}{%
|
\subcaptionbox{Strong correlation\label{fig:dist-cor-stg}}{%
|
||||||
\includegraphics[width=.5\linewidth]{dist-cor-stg}%
|
\includegraphics[width=.5\linewidth]{evaluation/dist-cor-stg}%
|
||||||
}%
|
}%
|
||||||
\caption{Privacy loss for different {\thethings} percentages and distributions, under weak, moderate, and strong degrees of temporal correlation.
|
\caption{Privacy loss for different {\thethings} percentages and distributions, under (a)~weak, (b)~moderate, and (c)~strong degrees of temporal correlation.
|
||||||
The line shows the overall privacy loss without temporal correlation.}
|
The line shows the overall privacy loss without temporal correlation.}
|
||||||
\label{fig:dist-cor}
|
\label{fig:dist-cor}
|
||||||
\end{figure}
|
\end{figure}
|
||||||
|
Loading…
Reference in New Issue
Block a user