expt: Testing epsilon percentages

2021-10-12 23:26:38 +02:00
parent 417ed28513
commit dc42ec6663
7 changed files with 536 additions and 0 deletions
--- a/code/expt/copenhagen-sel-eps.py
+++ b/code/expt/copenhagen-sel-eps.py
@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+
+import sys
+sys.path.insert(1, '../lib')
+import argparse
+import ast
+from datetime import datetime
+from geopy.distance import distance
+import lmdk_bgt
+import lmdk_lib
+import lmdk_sel
+import exp_mech
+import math
+import numpy as np
+from matplotlib import pyplot as plt
+import time
+
+
+def main(args):
+  '''Plot the MAE per landmarks percentage of one Copenhagen user for each epsilon split.'''
+  # Contacts for all users
+  cont_data = lmdk_lib.load_data(args, 'cont')
+  # Contacts for landmark's percentages for all users
+  lmdk_data = lmdk_lib.load_data(args, 'usrs_data')
+  # The name of the dataset (used in the banner and in the output file name)
+  d = 'Copenhagen'
+  # The user's id
+  uid = '449'
+  # The landmarks percentages
+  lmdks_pct = [0, 20, 40, 60, 80, 100]
+  # The total privacy budget and the shares (%) of it given to landmark selection
+  epsilon = 1.0
+  eps_pct = [20, 40, 60, 80]
+
+  markers = [
+    '^', # 20
+    'v', # 40
+    'D', # 60
+    's'  # 80
+  ]
+
+  print('\n##############################', d, '\n')
+  # Get user's contacts sequence (at most the first 1000 rows)
+  seq = cont_data[cont_data[:, 1] == float(uid)][:1000]
+
+  # Initialize plot
+  lmdk_lib.plot_init()
+  # The x axis
+  x_i = np.arange(len(lmdks_pct))
+  plt.xticks(x_i, np.array(lmdks_pct, int))
+  plt.xlabel('Landmarks (%)')  # Set x axis label.
+  plt.xlim(x_i.min(), x_i.max())
+  # The y axis
+  plt.ylabel('Mean absolute error (%)')  # Set y axis label.
+  # plt.yscale('log')
+  plt.ylim(0, 100)
+
+  mae_evt = 0  # Error of the 'event' horizontal line
+  mae_usr = 0  # Error of the 'user' horizontal line
+
+  for i_e, e in enumerate(eps_pct):
+    mae = np.zeros(len(lmdks_pct))
+
+    for i, pct in enumerate(lmdks_pct):
+      # Find landmarks
+      lmdks = lmdk_lib.find_lmdks_cont(lmdk_data, seq, uid, pct)
+
+      for _ in range(args.iter):
+        # Give e% of epsilon to the landmark selection, the rest to the release
+        lmdks_sel = lmdk_sel.find_lmdks_eps(seq, lmdks, epsilon*e/100)
+
+        # Uniform
+        rls_data, _ = lmdk_bgt.uniform_cont(seq, lmdks_sel, epsilon*(1 - e/100))
+        mae[i] += (lmdk_bgt.mae_cont(rls_data)/args.iter)*100
+
+        # Calculate once per run (still repeated and averaged over the iterations)
+        if e == eps_pct[0] and pct == lmdks_pct[0]:
+          # Event: first landmarks percentage, original landmarks, whole epsilon
+          rls_data_evt, _ = lmdk_bgt.uniform_cont(seq, lmdks, epsilon)
+          mae_evt += (lmdk_bgt.mae_cont(rls_data_evt)/args.iter)*100
+        elif e == eps_pct[-1] and pct == lmdks_pct[-1]:
+          # User: last landmarks percentage, original landmarks, whole epsilon
+          rls_data_usr, _ = lmdk_bgt.uniform_cont(seq, lmdks, epsilon)
+          mae_usr += (lmdk_bgt.mae_cont(rls_data_usr)/args.iter)*100
+
+    # Plot line (one per epsilon share)
+    plt.plot(
+      x_i,
+      mae,
+      label=str(e/100) + 'ε',
+      marker=markers[i_e],
+      markersize=lmdk_lib.marker_size,
+      markeredgewidth=0,
+      linewidth=lmdk_lib.line_width
+    )
+
+  plt.axhline(
+    y = mae_evt,
+    color = '#212121',
+    linewidth=lmdk_lib.line_width
+  )
+  plt.text(x_i[-1] + x_i[-1]*.01, mae_evt - mae_evt*.05, 'event')
+
+  plt.axhline(
+    y = mae_usr,
+    color = '#616161',
+    linewidth=lmdk_lib.line_width
+  )
+  plt.text(x_i[-1] + x_i[-1]*.01, mae_usr - mae_usr*.05, 'user')
+
+  path = str('../../rslt/lmdk_sel_eps/' + d)
+  # Plot legend
+  lmdk_lib.plot_legend()
+  # # Show plot
+  # plt.show()
+  # Save plot
+  lmdk_lib.save_plot(path + '-sel-eps.pdf')
+  print('[OK]', flush=True)
+
+
+def parse_args():
+  '''
+    Read the command-line options of the experiment.
+
+    Optional:
+      res  - The results archive file.
+      iter - The total iterations.
+
+    Returns:
+      The namespace holding the parsed options.
+  '''
+  cli = argparse.ArgumentParser()
+  # Both options are optional and fall back to their defaults.
+  cli.add_argument(
+    '-r', '--res', type=str, help='The results archive file.',
+    default='/home/manos/Cloud/Data/Copenhagen/Results.zip'
+  )
+  cli.add_argument(
+    '-i', '--iter', type=int, help='The total iterations.', default=1
+  )
+  return cli.parse_args()
+
+
+if __name__ == '__main__':
+  try:  # Time the whole experiment; Ctrl-C ends the run without a traceback.
+    start_time = time.time()
+    main(parse_args())
+    end_time = time.time()
+    print('##############################')
+    print('Time elapsed: %s' % (time.strftime('%H:%M:%S', time.gmtime(end_time - start_time))))
+    print('##############################')
+  except KeyboardInterrupt:
+    print('Interrupted by user.')
+    sys.exit()  # Not the `site` helper exit(), which is missing under `python -S`.
--- a/code/expt/hue-sel-eps.py
+++ b/code/expt/hue-sel-eps.py
@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+
+import sys
+sys.path.insert(1, '../lib')
+import argparse
+import ast
+from datetime import datetime
+from geopy.distance import distance
+import lmdk_bgt
+import lmdk_lib
+import lmdk_sel
+import exp_mech
+import math
+import numpy as np
+from matplotlib import pyplot as plt
+import time
+
+
+def main(args):
+  '''Plot the MAE per landmarks percentage of the HUE consumption data for each epsilon split.'''
+  # User's consumption
+  seq = lmdk_lib.load_data(args, 'cons')
+  # The name of the dataset (used in the banner and in the output file name)
+  d = 'HUE'
+  # The landmarks percentages
+  lmdks_pct = [0, 20, 40, 60, 80, 100]
+  # Landmarks' thresholds (one per percentage, applied to the second column of seq)
+  lmdks_th = [0, .54, .68, .88, 1.12, 10]
+  # The total privacy budget and the shares (%) of it given to landmark selection
+  epsilon = 1.0
+  eps_pct = [20, 40, 60, 80]
+
+  markers = [
+    '^', # 20
+    'v', # 40
+    'D', # 60
+    's'  # 80
+  ]
+
+  print('\n##############################', d, '\n')
+  # Initialize plot
+  lmdk_lib.plot_init()
+  # The x axis
+  x_i = np.arange(len(lmdks_pct))
+  plt.xticks(x_i, np.array(lmdks_pct, int))
+  plt.xlabel('Landmarks (%)')  # Set x axis label.
+  plt.xlim(x_i.min(), x_i.max())
+  # The y axis
+  plt.ylabel('Mean absolute error (kWh)')  # Set y axis label.
+  plt.yscale('log')
+  plt.ylim(.1, 100000)
+
+  mae_evt = 0  # Error of the 'event' horizontal line
+  mae_usr = 0  # Error of the 'user' horizontal line
+
+  for i_e, e in enumerate(eps_pct):
+    mae = np.zeros(len(lmdks_pct))
+
+    for i, pct in enumerate(lmdks_pct):
+      # Find landmarks
+      lmdks = seq[seq[:, 1] < lmdks_th[i]]
+
+      for _ in range(args.iter):
+        # Keep the selection in its own name: every iteration has to start
+        # from the original landmarks, and so do the baselines below.
+        lmdks_sel = lmdk_sel.find_lmdks_eps(seq, lmdks, epsilon*e/100)
+        # Uniform
+        rls_data, _ = lmdk_bgt.uniform_cons(seq, lmdks_sel, epsilon*(1 - e/100))
+        mae[i] += lmdk_bgt.mae_cons(seq, rls_data)/args.iter
+
+        # Calculate once per run (still repeated and averaged over the iterations)
+        if e == eps_pct[0] and pct == lmdks_pct[0]:
+          # Event: first landmarks percentage, original landmarks, whole epsilon
+          rls_data_evt, _ = lmdk_bgt.uniform_cons(seq, lmdks, epsilon)
+          mae_evt += lmdk_bgt.mae_cons(seq, rls_data_evt)/args.iter
+        elif e == eps_pct[-1] and pct == lmdks_pct[-1]:
+          # User: last landmarks percentage, original landmarks, whole epsilon
+          rls_data_usr, _ = lmdk_bgt.uniform_cons(seq, lmdks, epsilon)
+          mae_usr += lmdk_bgt.mae_cons(seq, rls_data_usr)/args.iter
+
+    # Plot line (one per epsilon share)
+    plt.plot(
+      x_i,
+      mae,
+      label=str(e/100) + 'ε',
+      marker=markers[i_e],
+      markersize=lmdk_lib.marker_size,
+      markeredgewidth=0,
+      linewidth=lmdk_lib.line_width
+    )
+
+  plt.axhline(
+    y = mae_evt,
+    color = '#212121',
+    linewidth=lmdk_lib.line_width
+  )
+  plt.text(x_i[-1] + x_i[-1]*.01, mae_evt - mae_evt*.14, 'event')
+
+  plt.axhline(
+    y = mae_usr,
+    color = '#616161',
+    linewidth=lmdk_lib.line_width
+  )
+  plt.text(x_i[-1] + x_i[-1]*.01, mae_usr - mae_usr*.14, 'user')
+
+  path = str('../../rslt/lmdk_sel_eps/' + d)
+  # Plot legend
+  lmdk_lib.plot_legend()
+  # Show plot
+  # plt.show()
+  # Save plot
+  lmdk_lib.save_plot(path + '-sel-eps.pdf')
+  print('[OK]', flush=True)
+
+
+def parse_args():
+  '''
+    Read the command-line options of the experiment.
+
+    Optional:
+      res  - The results archive file.
+      iter - The total iterations.
+
+    Returns:
+      The namespace holding the parsed options.
+  '''
+  cli = argparse.ArgumentParser()
+  # Both options are optional and fall back to their defaults.
+  cli.add_argument(
+    '-r', '--res', type=str, help='The results archive file.',
+    default='/home/manos/Cloud/Data/HUE/Results.zip'
+  )
+  cli.add_argument(
+    '-i', '--iter', type=int, help='The total iterations.', default=1
+  )
+  return cli.parse_args()
+
+
+if __name__ == '__main__':
+  try:  # Time the whole experiment; Ctrl-C ends the run without a traceback.
+    start_time = time.time()
+    main(parse_args())
+    end_time = time.time()
+    print('##############################')
+    print('Time elapsed: %s' % (time.strftime('%H:%M:%S', time.gmtime(end_time - start_time))))
+    print('##############################')
+  except KeyboardInterrupt:
+    print('Interrupted by user.')
+    sys.exit()  # Not the `site` helper exit(), which is missing under `python -S`.
--- a/code/expt/t-drive-sel-eps.py
+++ b/code/expt/t-drive-sel-eps.py
@ -0,0 +1,179 @@
+#!/usr/bin/env python3
+
+import sys
+sys.path.insert(1, '../lib')
+import argparse
+from datetime import datetime
+from geopy.distance import distance
+import lmdk_bgt
+import lmdk_lib
+import lmdk_sel
+import exp_mech
+import numpy as np
+from matplotlib import pyplot as plt
+import time
+
+
+def main(args):
+  '''Plot the MAE per landmarks percentage of one T-drive trajectory for each epsilon split.'''
+  # The data files
+  data_files = {
+    'T-drive': '/home/manos/Cloud/Data/T-drive/Results.zip',
+  }
+  # Data related info
+  data_info = {
+    'T-drive': {
+      'uid': 2,
+      'lmdks': {
+          0: {'dist': 0, 'per': 1000},   #   0.0%
+         20: {'dist': 2095, 'per': 30},  #  19.6%
+         40: {'dist': 2790, 'per': 30},  #  40.2%
+         60: {'dist': 3590, 'per': 30},  #  59.9%
+         80: {'dist': 4825, 'per': 30},  #  79.4%
+        100: {'dist': 10350, 'per': 30}  # 100.0%
+      }
+    }
+  }
+  # The data sets
+  data_sets = {}
+  # Load data sets
+  for df in data_files:
+    args.res = data_files[df]  # NOTE(review): overrides any --res given on the command line
+    data_sets[df] = lmdk_lib.load_data(args, 'usrs_data')
+  # Geo-I configuration
+  # epsilon = level/radius
+  # Radius is in meters
+  bgt_conf = [
+    {'epsilon': 1},
+  ]
+  eps_pct = [20, 40, 60, 80]  # Shares (%) of epsilon given to landmark selection
+
+  markers = [
+    '^', # 20
+    'v', # 40
+    'D', # 60
+    's'  # 80
+  ]
+
+  # The x axis
+  x_i = np.arange(len(list(data_info.values())[0]['lmdks']))
+
+  for d in data_sets:
+    print('\n##############################', d, '\n')
+    args.res = data_files[d]
+    data = data_sets[d]
+    # Truncate trajectory according to arguments
+    seq = data[data[:,0]==data_info[d]['uid'], :][:args.time]
+
+    # Initialize plot
+    lmdk_lib.plot_init()
+    # The x axis
+    plt.xticks(x_i, np.array([key for key in data_info[d]['lmdks']]).astype(int))
+    plt.xlabel('Landmarks (%)')  # Set x axis label.
+    plt.xlim(x_i.min(), x_i.max())
+    # The y axis
+    plt.ylabel('Mean absolute error (m)')  # Set y axis label.
+    plt.yscale('log')
+    plt.ylim(1, 1000000)
+
+    mae_evt = 0  # Error of the 'event' horizontal line
+    mae_usr = 0  # Error of the 'user' horizontal line
+    for i_e, e in enumerate(eps_pct):
+      mae = np.zeros(len(data_info[d]['lmdks']))
+      for i, lmdk in enumerate(data_info[d]['lmdks']):
+        # Find landmarks
+        args.dist = data_info[d]['lmdks'][lmdk]['dist']
+        args.per = data_info[d]['lmdks'][lmdk]['per']
+        lmdks = lmdk_lib.find_lmdks(seq, args)[:args.time]
+        for bgt in bgt_conf:
+          for _ in range(args.iter):
+            # Keep the selection in its own name: every iteration has to start
+            # from the original landmarks, and so do the baselines below.
+            lmdks_sel = lmdk_sel.find_lmdks_eps(seq, lmdks, bgt['epsilon']*e/100)
+            # Uniform
+            rls_data_u, _ = lmdk_bgt.uniform_r(seq, lmdks_sel, bgt['epsilon']*(1 - e/100))
+            mae[i] += lmdk_bgt.mae(seq, rls_data_u)/args.iter
+
+            # Calculate once per run (still repeated and averaged over the iterations)
+            if e == eps_pct[0] and lmdk == min(data_info[d]['lmdks']):
+              # Event: lowest landmarks percentage, original landmarks, whole epsilon
+              rls_data_evt, _ = lmdk_bgt.uniform_r(seq, lmdks, bgt['epsilon'])
+              mae_evt += lmdk_bgt.mae(seq, rls_data_evt)/args.iter
+            elif e == eps_pct[-1] and lmdk == max(data_info[d]['lmdks']):
+              # User: highest landmarks percentage, original landmarks, whole epsilon
+              rls_data_usr, _ = lmdk_bgt.uniform_r(seq, lmdks, bgt['epsilon'])
+              mae_usr += lmdk_bgt.mae(seq, rls_data_usr)/args.iter
+
+      # Plot line (one per epsilon share)
+      plt.plot(
+        x_i,
+        mae,
+        label=str(e/100) + 'ε',
+        marker=markers[i_e],
+        markersize=lmdk_lib.marker_size,
+        markeredgewidth=0,
+        linewidth=lmdk_lib.line_width
+      )
+
+    plt.axhline(
+      y = mae_evt,
+      color = '#212121',
+      linewidth=lmdk_lib.line_width
+    )
+    plt.text(x_i[-1] + x_i[-1]*.01, mae_evt - mae_evt*.05, 'event')
+
+    plt.axhline(
+      y = mae_usr,
+      color = '#616161',
+      linewidth=lmdk_lib.line_width
+    )
+    plt.text(x_i[-1] + x_i[-1]*.01, mae_usr - mae_usr*.05, 'user')
+
+    path = str('../../rslt/lmdk_sel_eps/' + d)
+    # Plot legend
+    lmdk_lib.plot_legend()
+    # # Show plot
+    # plt.show()
+    # Save plot
+    lmdk_lib.save_plot(path + '-sel-eps.pdf')
+
+
+def parse_args():
+  '''
+    Parse arguments.
+
+    Optional:
+      dist - The coordinates distance threshold in meters.
+      per  - The timestamps period threshold in minutes.
+      res  - The results archive file.
+      time - The total timestamps.
+      iter - The total iterations.
+  '''
+  # Create argument parser.
+  parser = argparse.ArgumentParser()
+
+  # Mandatory arguments: none.
+  # Optional arguments.
+  parser.add_argument('-l', '--dist', help='The coordinates distance threshold in meters.', type=int, default=200)
+  parser.add_argument('-p', '--per', help='The timestamps period threshold in minutes.', type=int, default=30)
+  parser.add_argument('-r', '--res', help='The results archive file.', type=str, default='/home/manos/Cloud/Data/T-drive/Results.zip')
+  parser.add_argument('-t', '--time', help='The total timestamps.', type=int, default=1000)
+  parser.add_argument('-i', '--iter', help='The total iterations.', type=int, default=1)
+
+  # Parse arguments.
+  args = parser.parse_args()
+
+  return args
+
+
+if __name__ == '__main__':
+  try:  # Time the whole experiment; Ctrl-C ends the run without a traceback.
+    start_time = time.time()
+    main(parse_args())
+    end_time = time.time()
+    print('##############################')
+    print('Time elapsed: %s' % (time.strftime('%H:%M:%S', time.gmtime(end_time - start_time))))
+    print('##############################')
+  except KeyboardInterrupt:
+    print('Interrupted by user.')
+    sys.exit()  # Not the `site` helper exit(), which is missing under `python -S`.
--- a/code/lib/lmdk_sel.py
+++ b/code/lib/lmdk_sel.py
@ -391,6 +391,60 @@ def find_lmdks(seq, lmdks, epsilon):
    lmdks_new = seq[lmdks_seq_new - 1]
  return lmdks_new, epsilon - eps_sel

+
+def find_lmdks_eps(seq, lmdks, epsilon):
+  '''
+    Add dummy landmarks to original landmarks.
+
+    Parameters:
+      seq     - All of the data points.
+      lmdks   - The original landmarks.
+      epsilon - The available privacy budget (handed as is to the exponential mechanism).
+
+    Returns:
+      lmdks_new - The new landmarks (lmdks itself if it is empty or as long as seq).
+  '''
+  # The new landmarks; left as the input when there is nothing to select from
+  lmdks_new = lmdks
+  if len(lmdks) > 0 and len(seq) != len(lmdks):
+    # Get landmarks timestamps in sequence (presumably 1-based, see the `- 1` at the end)
+    lmdks_seq = find_lmdks_seq(seq, lmdks)
+    # Turn landmarks to histogram; h is used below as the size of each part
+    hist, h = get_hist(get_seq(1, len(seq)), lmdks_seq)
+    # Find all possible options
+    opts = get_opts_from_top_h(get_seq(1, len(seq)), lmdks_seq)
+    # Get landmarks histogram with dummy landmarks (1.0 is presumably the sensitivity -- TODO confirm)
+    hist_new, _ = exp_mech.exponential(hist, opts, exp_mech.score, 1.0, epsilon)
+    # Split sequence in parts of size h; the last part is stretched to the end of seq
+    pt_idx = []
+    for idx in range(1, len(seq), h):
+      pt_idx.append([idx, idx + h - 1])
+    pt_idx[-1][1] = len(seq)
+    # Get new landmarks indexes, part by part
+    lmdks_seq_new = np.array([], dtype=int)
+    for i, pt in enumerate(pt_idx):
+      # Landmarks that already fall within this part
+      lmdks_seq_pt = lmdks_seq[(lmdks_seq >= pt[0]) & (lmdks_seq <= pt[1])]
+      # Number of dummies to add and the regular (non-landmark) points to draw them from
+      size = hist_new[i] - len(lmdks_seq_pt)
+      rglr = np.setdiff1d(np.arange(pt[0], pt[1] + 1), lmdks_seq_pt)
+      # Add already landmarks
+      lmdks_seq_new = np.concatenate([lmdks_seq_new, lmdks_seq_pt])
+      # Add new landmarks. NOTE(review): the strict '>' adds nothing when exactly `size` regular points are left -- confirm intended
+      if size > 0 and len(rglr) > size:
+        lmdks_seq_new = np.concatenate([lmdks_seq_new,
+          np.random.choice(
+            rglr, 
+            size = size, 
+            replace = False
+          )
+        ])
+    # Get actual landmarks values (shift the indexes back by one to address seq)
+    lmdks_new = seq[lmdks_seq_new - 1]
+  return lmdks_new
+
+
+
 def test():
  # Start and end points of the sequence
  # # Nonrandom
--- a/rslt/lmdk_sel_eps/Copenhagen-sel-eps.pdf
+++ b/rslt/lmdk_sel_eps/Copenhagen-sel-eps.pdf
--- a/rslt/lmdk_sel_eps/HUE-sel-eps.pdf
+++ b/rslt/lmdk_sel_eps/HUE-sel-eps.pdf
--- a/rslt/lmdk_sel_eps/T-drive-sel-eps.pdf
+++ b/rslt/lmdk_sel_eps/T-drive-sel-eps.pdf