#!/usr/bin/env python3

from datetime import datetime
from geopy.distance import distance
import math
import numpy as np
import random
import sympy as sp
import time

import lmdk_lib
from matplotlib import pyplot as plt


def draw_bgts(title, bgts):
    '''
    Plots the allocated budget.

    Parameters:
        title - The title of the plot.
        bgts - The allocated privacy budget.

    Returns:
        Nothing.
    '''
    lmdk_lib.plot_init()
    p = np.arange(1, len(bgts) + 1, 1)
    plt.plot(
        p,
        bgts,
        linewidth=lmdk_lib.line_width,
        markersize=lmdk_lib.marker_size,
        markeredgewidth=0,
        label=r'$\varepsilon$',
        marker='s'
    )
    lmdk_lib.plot_legend()
    # Set the plot box
    plt.axis([1, len(bgts), .0, max(bgts) + max(bgts)/2])
    # Set the x axis label
    plt.xlabel('Timestamp')
    # Set the y axis label
    plt.ylabel('Privacy budget')
    plt.title(title.replace('_', '-'))
    plt.show()


def validate_bgts(seq, lmdks, epsilon, bgts):
    '''
    Budget allocation validation.

    Parameters:
        seq - The point sequence.
        lmdks - The landmarks.
        epsilon - The available privacy budget.
        bgts - The privacy budget allocation.

    Returns:
        The index of the privacy budget that caused the failure,
        or -1 if the validation was successful.
    '''
    bgts_sum = .0
    # Landmarks
    for i, p in enumerate(seq):
        if np.any(lmdks[:] == p):
            bgts_sum += bgts[i]
    # Regular events
    bgts_max = bgts_sum
    for i, p in enumerate(seq):
        if not np.any(lmdks[:] == p):
            bgts_cur = bgts_sum + bgts[i]
            if bgts_cur > bgts_max:
                bgts_max = bgts_cur
            if bgts_cur > epsilon and not math.isclose(bgts_cur, epsilon, rel_tol=.001):
                print('Budget allocation validation failed: %.2f%% (%.4f/%.4f)' % (100*bgts_max/epsilon, bgts_max, epsilon))
                return i
    print(
        'Landmark budget allocation: %.2f%% (%.4f/%.4f)\n'
        'Overall budget allocation : %.2f%% (%.4f/%.4f)\n'
        % (100*bgts_max/epsilon, bgts_max, epsilon, 100*sum(bgts)/epsilon, sum(bgts), epsilon)
    )
    return -1


def uniform(seq, lmdks, epsilon):
    '''
    Uniform budget allocation.

    Parameters:
        seq - The point sequence.
        lmdks - The landmarks.
        epsilon - The available privacy budget.

    Returns:
        bgts - The privacy budget allocation.
    '''
    bgts = np.zeros(len(seq))
    # Allocate enough budget for all landmarks and one regular point
    k = len(lmdks) + 1
    # All points are landmarks
    if k > len(seq):
        k = len(lmdks)
    # Allocate the budget
    for i, _ in enumerate(bgts):
        bgts[i] = epsilon/k
    return bgts


def geometric(seq, lmdks, epsilon):
    '''
    Geometric budget allocation.

    Parameters:
        seq - The point sequence.
        lmdks - The landmarks.
        epsilon - The available privacy budget.

    Returns:
        bgts - The privacy budget distribution.
    '''
    bgts = np.zeros([len(seq)])
    epsilon_avail = epsilon
    # Landmarks: every landmark claims half of the remaining budget
    for i, p in enumerate(seq):
        if np.any(lmdks[:] == p):
            bgts[i] = epsilon_avail/2
            epsilon_avail = epsilon_avail - epsilon_avail/2
    # Regular events: each one gets the leftover budget
    for i, p in enumerate(seq):
        if not np.any(lmdks[:] == p):
            bgts[i] = epsilon_avail
    return bgts
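
# Illustrative sketch (not part of the original module): compares the static
# allocation schemes above on a toy sequence. The sequence, landmarks, and
# budget below are hypothetical values chosen for demonstration.
def _demo_static_allocations():
    seq = np.arange(1, 11)    # 10 timestamps
    lmdks = np.array([3, 7])  # 2 landmark timestamps
    epsilon = 1.0
    for scheme in (uniform, geometric):
        # uniform spends epsilon/3 everywhere; geometric halves the remaining
        # budget at every landmark and leaves the rest to regular points
        bgts = scheme(seq, lmdks, epsilon)
        print('%-9s: %s' % (scheme.__name__, np.round(bgts, 4)))
        validate_bgts(seq, lmdks, epsilon, bgts)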
def exponential(seq, lmdks, epsilon):
    '''
    Exponential uniform budget allocation (max to user-level).

    Parameters:
        seq - The point sequence.
        lmdks - The landmarks.
        epsilon - The available privacy budget.

    Returns:
        bgts - The privacy budget allocation.
    '''
    # Fall back to uniform if there are zero or only landmarks
    if len(lmdks) == 0 or len(lmdks) == len(seq):
        return uniform(seq, lmdks, epsilon)
    # # In case of seq == lmdks
    # l = 0
    # N = epsilon/len(seq)
    bgts = np.zeros([len(seq)])
    # Find the worst-case regular point
    p_sel = 0
    for p in seq:
        if not np.any(lmdks[:] == p):
            p_sel = p
            break
    # List all landmark timestamps together with the worst-case regular point
    points = np.append(lmdks, [p_sel])
    points = np.sort(points)
    # epsilon_t = N*e^(-l*t)
    l = sp.symbols('l', real=True)
    # The time horizon, i.e., the last timestamp
    T = seq.max()
    # Bound the cumulative privacy budget at L union {t}:
    # epsilon = sum(N*e^(-l*t)) for t in L union {t}
    eq = 0
    for t in points:
        eq += ((epsilon/len(seq))/sp.exp(-l*T))*sp.exp(-l*t)
    try:
        l = sp.solve(eq - epsilon, simplify=False, rational=False)[0]
        # l = sp.solveset(eq <= epsilon, l, sp.S.Reals).sup
    except Exception:
        # No solution was found; return the all-zeros allocation
        return bgts
    # Allocate epsilon/len(seq) to the last point T, i.e., user-level
    N = (epsilon/len(seq))/sp.exp(-l*T)
    # Allocate the budget
    for i, t in enumerate(seq):
        bgts[i] = float(N*sp.exp(-l*t))
    return bgts


def linear_zero(seq, lmdks, epsilon):
    '''
    Linear uniform budget allocation (max to zero).

    Parameters:
        seq - The point sequence.
        lmdks - The landmarks.
        epsilon - The available privacy budget.

    Returns:
        bgts - The privacy budget allocation.
    '''
    bgts = np.zeros([len(seq)])
    # Find the worst-case regular point
    p_sel = 0
    for p in seq:
        if not np.any(lmdks[:] == p):
            p_sel = p
            break
    # Sum all landmark timestamps together with the worst-case regular point
    sum_cur = p_sel
    for p in lmdks:
        sum_cur += p
    # epsilon_t = a*t + b
    b = sp.symbols('b')
    # The time horizon, i.e., the last timestamp
    T = seq[len(seq) - 1]
    # Bound the cumulative privacy budget at the landmarks and one extra point
    b = sp.solve(b - ((b/T)*sum_cur + epsilon)/(len(lmdks) + 1))[0]
    # Allocate 0 to the last point T
    a = -b/T
    # Allocate the budget
    for i, t in enumerate(seq):
        bgts[i] = float(a*t + b)
    return bgts


def linear(seq, lmdks, epsilon):
    '''
    Linear uniform budget allocation (max to user-level).

    Parameters:
        seq - The point sequence.
        lmdks - The landmarks.
        epsilon - The available privacy budget.

    Returns:
        bgts - The privacy budget allocation.
    '''
    # Fall back to uniform if there are zero or only landmarks
    if len(lmdks) == 0 or len(lmdks) == len(seq):
        return uniform(seq, lmdks, epsilon)
    # Find the worst-case regular point
    p_sel = 0
    for p in seq:
        if not np.any(lmdks[:] == p):
            p_sel = p
            break
    # Sum all landmark timestamps together with the worst-case regular point
    sum_cur = p_sel + np.sum(lmdks)
    # epsilon_t = a*t + b
    b = sp.symbols('b', real=True)
    # The time horizon, i.e., the last timestamp
    T = seq.max()
    # Number of points to take into account
    k = len(lmdks) + 1
    # if len(lmdks) == len(seq):
    #     k = len(lmdks)
    # Bound the cumulative privacy budget at L union {t}:
    # epsilon = a*sum(L union {t}) + |L union {t}|*b
    b = sp.solve(b + (((epsilon/len(seq) - b)/T)*sum_cur - epsilon)/k, simplify=False, rational=False)[0]
    # Allocate epsilon/len(seq) to the last point T, i.e., user-level
    a = (epsilon/len(seq) - b)/T
    # Allocate the budget
    bgts = np.zeros([len(seq)])
    for i, t in enumerate(seq):
        bgts[i] = float(a*t + b)
    return bgts


def plot_line(x_i, arr, label, marker, line):
    '''
    Plots a single black line with the given label, marker, and line style.
    '''
    plt.plot(
        x_i,
        arr,
        label=label,
        color='black',
        marker=marker,
        linestyle=line,
        # linewidth=1.0,
        markerfacecolor='none'
    )
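
# Illustrative sketch (hypothetical toy values): the linear scheme fits
# epsilon_t = a*t + b so that the landmarks plus the worst-case regular point
# together consume exactly epsilon, while the last timestamp T still receives
# the user-level budget epsilon/len(seq).
def _demo_linear():
    seq = np.arange(1, 11)
    lmdks = np.array([3, 7])
    epsilon = 1.0
    bgts = linear(seq, lmdks, epsilon)
    print('linear   :', np.round(bgts, 4))
    # The allocation should satisfy the landmark-level budget constraint
    assert validate_bgts(seq, lmdks, epsilon, bgts) == -1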
def stepped(seq, lmdks, epsilon):
    '''
    Stepped budget allocation.

    Parameters:
        seq - The point sequence.
        lmdks - The landmarks.
        epsilon - The available privacy budget.

    Returns:
        bgts - The privacy budget allocation.
    '''
    bgts = np.zeros([len(seq)])
    epsilon_avail = epsilon/2
    for i, p in enumerate(seq):
        # Halve the available budget whenever a landmark is reached
        if np.any(lmdks[:] == p):
            epsilon_avail = epsilon_avail/2
        bgts[i] = epsilon_avail
    return bgts


def adaptive(seq, lmdks, epsilon, inc_rt, dec_rt):
    '''
    Adaptive budget allocation.

    Parameters:
        seq - The point sequence.
        lmdks - The landmarks.
        epsilon - The available privacy budget.
        inc_rt - Sampling rate increase rate.
        dec_rt - Sampling rate decrease rate.

    Returns:
        rls_data - The perturbed data.
        bgts - The privacy budget allocation.
        skipped - The number of skipped releases.
    '''
    # Uniform budget allocation
    bgts = uniform(seq, lmdks, epsilon)
    # Released data
    rls_data = [None]*len(seq)
    # The sampling rate
    samp_rt = 1
    # Track landmarks
    lmdk_cur = 0
    # Track skipped releases
    skipped = 0
    for i, p in enumerate(seq):
        # Check if the current point is a landmark
        if lmdk_lib.is_landmark(p, lmdks):
            lmdk_cur += 1
        # Get the coordinates
        loc = (p[1], p[2])
        if lmdk_lib.should_sample(samp_rt) or i == 0:
            # Add noise to the original data
            new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
            rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
            # Adjust the sampling rate
            if i > 0:
                if distance((rls_data[i - 1][1], rls_data[i - 1][2]), new_loc).km*1000 < 1/bgts[i]:
                    # Decrease
                    samp_rt -= samp_rt*dec_rt
                else:
                    # Increase
                    samp_rt += (1 - samp_rt)*inc_rt
        else:
            skipped += 1
            # Skip the current release and approximate it with the previous one
            rls_data[i] = rls_data[i - 1]
            if lmdk_lib.is_landmark(p, lmdks):
                # Allocate the current budget to the following releases uniformly
                for j in range(i + 1, len(seq)):
                    bgts[j] += bgts[i]/(len(lmdks) - lmdk_cur + 1)
                # No budget was spent
                bgts[i] = 0
    return rls_data, bgts, skipped


def adaptive_cont(seq, lmdks, epsilon, inc_rt, dec_rt):
    '''
    Adaptive budget allocation for binary sequences (randomized response).

    Parameters:
        seq - The point sequence.
        lmdks - The landmarks.
        epsilon - The available privacy budget.
        inc_rt - Sampling rate increase rate.
        dec_rt - Sampling rate decrease rate.

    Returns:
        rls_data - The perturbed data.
        bgts - The privacy budget allocation.
        skipped - The number of skipped releases.
    '''
    # Uniform budget allocation
    bgts = uniform(seq, lmdks, epsilon)
    # Released data
    rls_data = [None]*len(seq)
    # The sampling rate
    samp_rt = 1
    # Track landmarks
    lmdk_cur = 0
    # Track skipped releases
    skipped = 0
    for i, p in enumerate(seq):
        # Check if the current point is a landmark
        r = any((lmdks[:] == p).all(1))
        if r:
            lmdk_cur += 1
        if lmdk_lib.should_sample(samp_rt) or i == 0:
            # Perturb the original data
            o = lmdk_lib.randomized_response(r, bgts[i])
            rls_data[i] = [r, o]
            # Adjust the sampling rate
            if i > 0:
                if rls_data[i - 1][1] == o:
                    # Decrease
                    samp_rt -= samp_rt*dec_rt
                else:
                    # Increase
                    samp_rt += (1 - samp_rt)*inc_rt
        else:
            skipped += 1
            # Skip the current release and approximate it with the previous one
            rls_data[i] = rls_data[i - 1]
            if r:
                # Allocate the current budget to the following releases uniformly
                for j in range(i + 1, len(seq)):
                    bgts[j] += bgts[i]/(len(lmdks) - lmdk_cur + 1)
                # No budget was spent
                bgts[i] = 0
    return rls_data, bgts, skipped
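
# Illustrative sketch of the sampling-rate dynamics that adaptive() and
# adaptive_cont() share: multiplicative decrease while the previous release
# approximates the data well, additive increase towards 1 otherwise. The step
# pattern and rates below are hypothetical.
def _demo_sampling_rate(steps=6, inc_rt=.5, dec_rt=.5):
    samp_rt = 1.0
    for step in range(steps):
        if step % 2 == 0:
            # Previous release was a good approximation: sample less often
            samp_rt -= samp_rt*dec_rt
        else:
            # Previous release was a poor approximation: sample more often
            samp_rt += (1 - samp_rt)*inc_rt
        print('step %d: sampling rate %.3f' % (step, samp_rt))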
def skip(seq, lmdks, epsilon):
    '''
    Skip landmarks.

    Parameters:
        seq - The point sequence.
        lmdks - The landmarks.
        epsilon - The available privacy budget.

    Returns:
        rls_data - The perturbed data.
        bgts - The privacy budget allocation.
    '''
    # Event-level budget allocation
    bgts = np.array(len(seq)*[epsilon])
    # Released data
    rls_data = [None]*len(seq)
    for i, p in enumerate(seq):
        # Get the coordinates
        loc = (p[1], p[2])
        # Add noise to the original data
        new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
        # Check if the current point is a landmark
        if lmdk_lib.is_landmark(p, lmdks):
            if i > 0:
                # Approximate with the previous location
                new_loc = (rls_data[i - 1][1], rls_data[i - 1][2])
                # No budget was spent
                bgts[i] = 0
        rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
    return rls_data, bgts


def skip_cont(seq, lmdks, epsilon):
    '''
    Skip landmarks (binary sequences via randomized response).

    Parameters:
        seq - The point sequence.
        lmdks - The landmarks.
        epsilon - The available privacy budget.

    Returns:
        rls_data - The perturbed data.
        bgts - The privacy budget allocation.
    '''
    # Event-level budget allocation
    bgts = np.array(len(seq)*[epsilon])
    # Released data
    rls_data = [None]*len(seq)
    for i, p in enumerate(seq):
        # Check if the current point is a landmark
        r = any((lmdks[:] == p).all(1))
        # Add noise
        o = lmdk_lib.randomized_response(r, bgts[i])
        if r:
            if i > 0:
                # Approximate with the previous release
                o = rls_data[i - 1][1]
                # No budget was spent
                bgts[i] = 0
        rls_data[i] = [r, o]
    return rls_data, bgts


def sample(seq, lmdks, epsilon):
    '''
    Publish randomly.

    Parameters:
        seq - The point sequence.
        lmdks - The landmarks.
        epsilon - The available privacy budget.

    Returns:
        rls_data - The perturbed data.
        bgts - The privacy budget allocation.
        skipped - The number of skipped releases.
    '''
    # Uniform budget allocation
    bgts = uniform(seq, lmdks, epsilon)
    # Released data
    rls_data = [None]*len(seq)
    # The sampling rate (publish with 50% chance)
    samp_rt = .5
    # Track landmarks
    lmdk_cur = 0
    # Track skipped releases
    skipped = 0
    for i, p in enumerate(seq):
        # Check if the current point is a landmark
        if lmdk_lib.is_landmark(p, lmdks):
            lmdk_cur += 1
        # Get the coordinates
        loc = (p[1], p[2])
        if i == 0 or lmdk_lib.should_sample(samp_rt):
            # Add noise to the original data
            new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
            rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
        else:
            skipped += 1
            # Skip the current release and approximate it with the previous one
            rls_data[i] = rls_data[i - 1]
            if lmdk_lib.is_landmark(p, lmdks):
                # Allocate the current budget to the following releases uniformly
                for j in range(i + 1, len(seq)):
                    bgts[j] += bgts[i]/(len(lmdks) - lmdk_cur + 1)
                # No budget was spent
                bgts[i] = 0
    return rls_data, bgts, skipped
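
# Illustrative sketch (hypothetical numbers) of the redistribution step that
# sample(), adaptive(), and the other skipping variants perform: when a
# landmark release is skipped, its unspent budget is divided by the number of
# remaining landmarks plus one regular point and added to every later release.
def _demo_redistribution():
    bgts = np.full(5, .2)           # 5 timestamps, uniform budget
    n_lmdks, lmdk_cur, i = 2, 1, 1  # skipping the 1st of 2 landmarks at i=1
    for j in range(i + 1, len(bgts)):
        bgts[j] += bgts[i]/(n_lmdks - lmdk_cur + 1)
    bgts[i] = 0
    print(bgts)                     # [0.2, 0.0, 0.3, 0.3, 0.3]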
def discount(seq, lmdks, epsilon):
    '''
    Temporally discounted sampling.

    Parameters:
        seq - The point sequence.
        lmdks - The landmarks.
        epsilon - The available privacy budget.

    Returns:
        rls_data - The perturbed data.
        bgts - The privacy budget allocation.
        skipped - The number of skipped releases.
    '''
    # Uniform budget allocation
    bgts = uniform(seq, lmdks, epsilon)
    # Released data
    rls_data = [None]*len(seq)
    # Track landmarks
    lmdk_cur = 0
    # Track skipped releases
    skipped = 0
    # Initialize the sampling rate
    samp_rt = .5
    for i, p in enumerate(seq):
        # Check if the current point is a landmark
        if lmdk_lib.is_landmark(p, lmdks):
            lmdk_cur += 1
        # Adjust the sampling rate
        if len(lmdks) > 1:
            samp_rt = lmdks[lmdk_cur - 1][3]/lmdks[len(lmdks) - 1][3]
        # Get the coordinates
        loc = (p[1], p[2])
        if i == 0 or lmdk_lib.should_sample(samp_rt):
            # Add noise to the original data
            new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
            rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
        else:
            skipped += 1
            # Skip the current release and approximate it with the previous one
            rls_data[i] = rls_data[i - 1]
            if lmdk_lib.is_landmark(p, lmdks):
                # Allocate the current budget to the following releases uniformly
                for j in range(i + 1, len(seq)):
                    bgts[j] += bgts[i]/(len(lmdks) - lmdk_cur + 1)
                # No budget was spent
                bgts[i] = 0
    return rls_data, bgts, skipped


def incremental(seq, lmdks, epsilon, diff_rt):
    '''
    Incremental sampling.

    Parameters:
        seq - The point sequence.
        lmdks - The landmarks.
        epsilon - The available privacy budget.
        diff_rt - Sampling rate difference.

    Returns:
        rls_data - The perturbed data.
        bgts - The privacy budget allocation.
        skipped - The number of skipped releases.
    '''
    # Uniform budget allocation
    bgts = uniform(seq, lmdks, epsilon)
    # Released data
    rls_data = [None]*len(seq)
    # The sampling rate (publish with 50% chance)
    samp_rt = .5
    # Track landmarks
    lmdk_cur = 0
    # Track consecutive landmarks
    lmdk_con = 0
    # Track skipped releases
    skipped = 0
    for i, p in enumerate(seq):
        # Check if the current point is a landmark
        if lmdk_lib.is_landmark(p, lmdks):
            lmdk_cur += 1
            # Increase the sampling rate
            samp_rt += (1 - samp_rt)*diff_rt
        else:
            # Decrease the sampling rate
            samp_rt -= samp_rt*diff_rt
        # Get the coordinates
        loc = (p[1], p[2])
        if i == 0 or lmdk_lib.should_sample(samp_rt):
            # Add noise to the original data
            new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
            rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
            if lmdk_lib.is_landmark(p, lmdks):
                # Adjust the consecutive landmarks tracking
                lmdk_con += 1
        else:
            skipped += 1
            # Skip the current release and approximate it with the previous one
            rls_data[i] = rls_data[i - 1]
            if lmdk_lib.is_landmark(p, lmdks):
                # Reset the consecutive landmarks tracking
                lmdk_con = 0
                # Allocate the current budget to the following releases uniformly
                for j in range(i + 1, len(seq)):
                    bgts[j] += bgts[i]/(len(lmdks) - lmdk_cur + 1)
                # No budget was spent
                bgts[i] = 0
    return rls_data, bgts, skipped


def uniform_r(seq, lmdks, epsilon):
    '''
    Uniform budget allocation with perturbed location releases.

    Parameters:
        seq - The point sequence.
        lmdks - The landmarks.
        epsilon - The available privacy budget.

    Returns:
        rls_data - The perturbed data.
        bgts - The privacy budget allocation.
    '''
    # Released data
    rls_data = [None]*len(seq)
    # Budgets
    bgts = uniform(seq, lmdks, epsilon)
    for i, p in enumerate(seq):
        # Get the coordinates
        loc = (p[1], p[2])
        # Add noise to the original data
        new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
        rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
    return rls_data, bgts


def uniform_cont(seq, lmdks, epsilon):
    '''
    Uniform budget allocation with randomized-response releases.

    Parameters:
        seq - The point sequence.
        lmdks - The landmarks.
        epsilon - The available privacy budget.

    Returns:
        rls_data - The perturbed data.
        bgts - The privacy budget allocation.
    '''
    # Released data
    rls_data = [None]*len(seq)
    # Budgets
    bgts = uniform(seq, lmdks, epsilon)
    for i, p in enumerate(seq):
        r = any((lmdks[:] == p).all(1))
        # [original, perturbed]
        rls_data[i] = [r, lmdk_lib.randomized_response(r, bgts[i])]
    return rls_data, bgts
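
# Illustrative sketch of the randomized-response primitive that the *_cont
# variants rely on. The real implementation lives in lmdk_lib; the two-coin
# formula below is the standard mechanism and is an assumption, not a copy of
# lmdk_lib.randomized_response.
def _rr_sketch(truth, epsilon):
    # Report the truth with probability e^epsilon/(e^epsilon + 1)
    p_truth = math.exp(epsilon)/(math.exp(epsilon) + 1)
    return truth if random.random() < p_truth else not truth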
def utility_analysis(seq, lmdks, o, epsilon):
    '''
    Analyzes the utility.

    Parameters:
        seq - The point sequence.
        lmdks - The landmarks.
        o - The perturbed data.
        epsilon - The available privacy budget.

    Returns:
        Nothing.
    '''
    # Estimate the Mean Absolute Error
    mae = 0
    # Compare with the uniform allocation
    mae_u = 0
    bgts_u = uniform(seq, lmdks, epsilon)
    for i, p in enumerate(seq):
        mae += distance((p[1], p[2]), (o[i][1], o[i][2])).km*1000/len(seq)
        new_loc = lmdk_lib.add_polar_noise((p[1], p[2]), bgts_u[i])
        mae_u += distance((p[1], p[2]), (new_loc[0], new_loc[1])).km*1000/len(seq)
    print(
        '\n########## Analysis ##########\n'
        'MAE uniform : %.2fm\n'
        'MAE current : %.2fm\n'
        'Difference  : %.2f%%\n'
        '##############################\n'
        % (mae_u, mae, 100*(mae - mae_u)/mae_u)
    )


def mae(seq, o):
    '''
    Mean absolute error, in meters, between the original and the perturbed locations.
    '''
    mae = 0
    for i, p in enumerate(seq):
        mae += distance((p[1], p[2]), (o[i][1], o[i][2])).km*1000/len(seq)
    return mae


def mae_cont(o):
    '''
    Mean absolute error for binary releases, i.e., the fraction of flipped values.
    '''
    mae = 0
    for i, p in enumerate(o):
        # Check if the original differs from the released value
        if p[0] != p[1]:
            mae += 1/len(o)
    return mae
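
# Illustrative entry point (not in the original module) exercising the demo
# sketches defined above.
if __name__ == '__main__':
    _demo_static_allocations()
    _demo_linear()
    _demo_sampling_rate()
    _demo_redistribution()
    o = [[True, True], [True, False], [False, False], [False, True]]
    print('mae_cont: %.2f' % mae_cont(o))  # 0.50: 2 of 4 releases flipped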