#!/usr/bin/env python3
|
|
|
|
from datetime import datetime
|
|
from geopy.distance import distance
|
|
import math
|
|
import numpy as np
|
|
import random
|
|
import sympy as sp
|
|
import time
|
|
import lmdk_lib
|
|
from matplotlib import pyplot as plt
|
|
|
|
|
|
def draw_bgts(title, bgts):
    '''
    Plots the allocated budget.

    Parameters:
        title - The title of the plot.
        bgts  - The allocated privacy budget.
    Returns:
        Nothing.
    '''
    lmdk_lib.plot_init()
    # One tick per timestamp, starting at 1.
    timestamps = np.arange(1, len(bgts) + 1, 1)
    plt.plot(
        timestamps,
        bgts,
        marker='s',
        label=r'$\varepsilon$',
        linewidth=lmdk_lib.line_width,
        markersize=lmdk_lib.marker_size,
        markeredgewidth=0
    )
    lmdk_lib.plot_legend()
    # Fit the plot box around the data, leaving 50% headroom above the peak.
    peak = max(bgts)
    plt.axis([1, len(bgts), .0, peak + peak/2])
    # Axis labels
    plt.xlabel('Timestamp')
    plt.ylabel('Privacy budget')
    plt.title(title.replace('_', '-'))
    plt.show()
|
|
|
|
|
|
def validate_bgts(seq, lmdks, epsilon, bgts):
    '''
    Budget allocation validation.

    Parameters:
        seq     - The point sequence.
        lmdks   - The landmarks.
        epsilon - The available privacy budget.
        bgts    - The privacy budget allocation.
    Returns:
        The index of the privacy budget that caused the failure or -1 if successful.
    '''
    # Flag each point of the sequence as landmark / regular.
    lmdk_flags = [bool(np.any(lmdks[:] == p)) for p in seq]
    # Every release pays at least the total budget spent on landmarks.
    lmdk_total = .0
    for idx, is_lmdk in enumerate(lmdk_flags):
        if is_lmdk:
            lmdk_total += bgts[idx]
    worst = lmdk_total
    # Each regular event adds its own budget on top of the landmark total;
    # the allocation fails as soon as one such combination exceeds epsilon.
    for idx, is_lmdk in enumerate(lmdk_flags):
        if is_lmdk:
            continue
        total_here = lmdk_total + bgts[idx]
        if total_here > worst:
            worst = total_here
        if total_here > epsilon and not math.isclose(total_here, epsilon, rel_tol=.001):
            print('Budget allocation validation failed: %.2f%% (%.4f/%.4f)' %(100*worst/epsilon, worst, epsilon))
            return idx
    print(
        'Landmark budget allocation: %.2f%% (%.4f/%.4f)\n'
        'Overall budget allocation : %.2f%% (%.4f/%.4f)\n'
        %(100*worst/epsilon, worst, epsilon,
        100*sum(bgts)/epsilon, sum(bgts), epsilon,))
    return -1
|
|
|
|
|
|
def uniform(seq, lmdks, epsilon):
    '''
    Uniform budget allocation.

    Parameters:
        seq     - The point sequence.
        lmdks   - The landmarks.
        epsilon - The available privacy budget.
    Returns:
        bgts - The privacy budget allocation.
    '''
    # Empty sequence: nothing to allocate (also avoids a division by zero
    # when both seq and lmdks are empty).
    if len(seq) == 0:
        return np.zeros(0)
    # Reserve enough budget for all landmarks plus one regular point, so any
    # single regular release combined with every landmark stays within epsilon.
    k = len(lmdks) + 1
    # All points are landmarks: no extra regular slot is needed.
    if k > len(seq):
        k = len(lmdks)
    # Spread the budget evenly over every timestamp (vectorized fill instead
    # of the element-by-element loop).
    return np.full(len(seq), epsilon/k)
|
|
|
|
|
|
def geometric(seq, lmdks, epsilon):
    '''
    Geometric budget allocation.

    Parameters:
        seq     - The point sequence.
        lmdks   - The landmarks.
        epsilon - The available privacy budget.
    Returns:
        bgts - The privacy budget distribution.
    '''
    out = np.zeros([len(seq)])
    # Flag each point as landmark / regular once, up front.
    is_lmdk = [bool(np.any(lmdks[:] == point)) for point in seq]
    # Every landmark claims half of whatever budget is still available.
    remaining = epsilon
    for idx, flag in enumerate(is_lmdk):
        if flag:
            out[idx] = remaining/2
            remaining = remaining - remaining/2
    # Every regular event receives the full leftover budget.
    for idx, flag in enumerate(is_lmdk):
        if not flag:
            out[idx] = remaining
    return out
|
|
|
|
|
|
def exponential(seq, lmdks, epsilon):
    '''
    Exponential uniform budget allocation (max to user-level).

    Allocates a budget that decays exponentially over time,
    epsilon_t = N*e^(-l*t), scaled so that the final timestamp T
    receives the user-level share epsilon/len(seq).

    Parameters:
        seq - The point sequence (assumed a 1-D numpy array of timestamps
              here, since seq.max() and elementwise comparison are used —
              TODO confirm against callers).
        lmdks - The landmarks.
        epsilon - The available privacy budget.

    Returns:
        bgts - The privacy budget allocation (all zeros if the symbolic
               solver fails).
    '''
    # Fallback to uniform if zero or max landmarks
    if len(lmdks) == 0 or len(lmdks) == len(seq):
        return uniform(seq, lmdks, epsilon)
    # # In case of seq == lmdks
    # l = 0
    # N = epsilon/len(seq)
    # Otherwise
    bgts = np.zeros([len(seq)])
    # NOTE(review): after the guard above, len(seq) != len(lmdks) always
    # holds, so this condition is always true and `points` is always defined.
    if len(seq) != len(lmdks):
        # Find the worst-case regular point (the first non-landmark)
        p_sel = 0
        for p in seq:
            if not np.any(lmdks[:] == p):
                p_sel = p
                break
        # List all landmark timestamps together with the worst regular one
        points = np.append(lmdks, [p_sel])
        points = np.sort(points)
    # Budget model: epsilon_t = N*e^(-l*t); solve for the decay rate l
    l = sp.symbols('l', real=True)
    # Cumulative allocation at landmarks and one extra point, i.e., L union {t}
    T = seq.max()
    # Bounding the privacy budgets at L union {t}:
    # epsilon = sum(N*e^(-l*t)) for t in L union {t}
    # NOTE(review): this symbol is immediately shadowed by the loop
    # variable below and never used symbolically.
    t = sp.symbols('t')
    eq = 0
    for t in points:
        # N is expressed as (epsilon/len(seq))/e^(-l*T) so that the budget
        # at the last timestamp T equals the user-level share epsilon/len(seq).
        eq += (((epsilon/len(seq))/(sp.exp(-1*l*T)))*sp.exp(-1*l*t))
    try:
        l = sp.solve(eq - epsilon, simplify=False, rational=False)[0]
        # l = sp.solveset(eq <= epsilon, l, sp.S.Reals).sup
    except Exception:
        # Solver failed: return the all-zero allocation.
        return bgts
    # Allocate to the last point T epsilon/len(seq), i.e., user-level
    N = (epsilon/len(seq))/sp.exp(-1*l*T)
    # Evaluate the decay curve at every timestamp
    for i, t in enumerate(seq):
        bgts[i] = N*sp.exp(-1*l*t)
    return bgts
|
|
|
|
|
|
def linear_zero(seq, lmdks, epsilon):
    '''
    Linear uniform budget allocation (max to zero).

    Allocates a linearly decreasing budget, epsilon_t = a*t + b, whose
    value reaches zero at the last timestamp T (a = -b/T).

    Parameters:
        seq - The point sequence (assumed a 1-D numpy array of timestamps —
              TODO confirm against callers).
        lmdks - The landmarks.
        epsilon - The available privacy budget.

    Returns:
        bgts - The privacy budget allocation.
    '''
    bgts = np.zeros([len(seq)])
    # Find the worst-case regular point (the first non-landmark)
    p_sel = 0
    for p in seq:
        if not np.any(lmdks[:] == p):
            p_sel = p
            break
    # Sum all landmark timestamps together with the worst regular one
    sum_cur = p_sel
    for p in lmdks:
        sum_cur += p
    # Budget model: epsilon_t = a*t + b; solve for the intercept b
    b = sp.symbols('b')
    # Last timestamp: the line hits zero here
    T = seq[len(seq) - 1]
    # Bound the cumulative budget over landmarks plus one extra point:
    # with a = -b/T, require b = ((b/T)*sum_cur + epsilon)/(len(lmdks) + 1)
    b = sp.solve(b - ((b/T)*sum_cur + epsilon)/(len(lmdks) + 1))[0]
    # Slope chosen so the allocation at the last point T is zero
    a = -b/T
    # Evaluate the line at every timestamp
    for i, t in enumerate(seq):
        bgts[i] = a*t + b
    return bgts
|
|
|
|
|
|
def linear(seq, lmdks, epsilon):
    '''
    Linear uniform budget allocation (max to user-level).

    Allocates a linearly decreasing budget, epsilon_t = a*t + b, whose
    value at the last timestamp T equals the user-level share
    epsilon/len(seq).

    Parameters:
        seq - The point sequence (assumed a 1-D numpy array of timestamps
              here, since seq.max() is used — TODO confirm against callers).
        lmdks - The landmarks.
        epsilon - The available privacy budget.

    Returns:
        bgts - The privacy budget allocation.
    '''
    # Fallback to uniform if zero or max landmarks
    if len(lmdks) == 0 or len(lmdks) == len(seq):
        return uniform(seq, lmdks, epsilon)
    # Find the worst-case regular point (the first non-landmark)
    p_sel = 0
    for p in seq:
        if not np.any(lmdks[:] == p):
            p_sel = p
            break
    # Sum all landmark timestamps together with the worst regular one
    sum_cur = p_sel + np.sum(lmdks)
    # Budget model: epsilon_t = a*t + b; solve for the intercept b
    b = sp.symbols('b', real=True)
    # Cumulative allocation at landmarks and one extra point, i.e., L union {t}
    T = seq.max()
    # Number of points to take into account: |L union {t}|
    k = len(lmdks) + 1
    # if len(lmdks) == len(seq):
    #     k = len(lmdks)
    # Bounding the privacy budgets at L union {t}:
    # epsilon = a*sum(L union {t}) + |L union {t}|*b, with the slope a
    # constrained so that epsilon_T = epsilon/len(seq) (user-level).
    b = sp.solve(b + (((epsilon/len(seq) - b)/T)*sum_cur - epsilon)/k, simplify=False, rational=False)[0]
    # Allocate to the last point T epsilon/len(seq), i.e., user-level
    a = (epsilon/len(seq) - b)/T
    # Evaluate the line at every timestamp
    bgts = np.zeros([len(seq)])
    for i, t in enumerate(seq):
        bgts[i] = a*t + b
    return bgts
|
|
|
|
|
|
def plot_line(x_i, arr, label, marker, line):
    '''Plot one series in the house style: black line, hollow markers.'''
    style = dict(
        label=label,
        color='black',
        marker=marker,
        linestyle=line,
        # linewidth=1.0,
        markerfacecolor='none',
    )
    plt.plot(x_i, arr, **style)
|
|
|
|
|
|
def stepped(seq, lmdks, epsilon):
    '''
    Stepped budget allocation.

    Parameters:
        seq     - The point sequence.
        lmdks   - The landmarks.
        epsilon - The available privacy budget.
    Returns:
        bgts - The privacy budget allocation.
    '''
    # Start at half the total budget and halve the per-release level
    # every time a landmark is encountered.
    level = epsilon/2
    levels = []
    for point in seq:
        if np.any(lmdks[:] == point):
            level = level/2
        levels.append(level)
    return np.array(levels)
|
|
|
|
|
|
def adaptive(seq, lmdks, epsilon, inc_rt, dec_rt):
    '''
    Adaptive budget allocation.

    Publishes with a sampling rate that adapts to how informative each
    release is: releases similar to the previous one lower the rate,
    dissimilar ones raise it. Skipped landmark budget is redistributed
    to later releases.

    Parameters:
        seq - The point sequence. Points are indexed as p[0]..p[3];
              p[1], p[2] are latitude/longitude — assumes 4-column rows,
              TODO confirm against callers.
        lmdks - The landmarks.
        epsilon - The available privacy budget.
        inc_rt - Sampling rate increase rate.
        dec_rt - Sampling rate decrease rate.

    Returns:
        rls_data - The perturbed data.
        bgts - The privacy budget allocation.
        skipped - The number of skipped releases.
    '''
    # Uniform budget allocation as the starting point
    bgts = uniform(seq, lmdks, epsilon)
    # Released
    rls_data = [None]*len(seq)
    # The sampling rate (start by always publishing)
    samp_rt = 1
    # Track how many landmarks have been seen so far
    lmdk_cur = 0
    # Track skipped releases
    skipped = 0
    for i, p in enumerate(seq):
        # Check if current point is a landmark
        if lmdk_lib.is_landmark(p, lmdks):
            lmdk_cur += 1
        # Get coordinates
        loc = (p[1], p[2])
        # The first point is always published
        if lmdk_lib.should_sample(samp_rt) or i == 0:
            # Add noise to original data
            new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
            rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
            # Adjust sampling rate based on the distance to the previous
            # release: 1/bgts[i] acts as the noise-scale threshold (meters)
            if i > 0:
                if distance((rls_data[i - 1][1], rls_data[i - 1][2]), new_loc).km*1000 < 1/bgts[i]:
                    # Release close to the previous one: decrease the rate
                    samp_rt -= samp_rt*dec_rt
                else:
                    # Release far from the previous one: increase the rate
                    samp_rt += (1 - samp_rt)*inc_rt
        else:
            skipped += 1
            # Skip current release and approximate with previous
            rls_data[i] = rls_data[i - 1]
            if lmdk_lib.is_landmark(p, lmdks):
                # Allocate the current budget to the following releases
                # uniformly (spread over the remaining landmarks + 1)
                for j in range(i + 1, len(seq)):
                    bgts[j] += bgts[i]/(len(lmdks) - lmdk_cur + 1)
                # No budget was spent
                bgts[i] = 0
    return rls_data, bgts, skipped
|
|
|
|
|
|
def skip(seq, lmdks, epsilon):
    '''
    Skip landmarks.

    Publishes every regular point with event-level budget and approximates
    every landmark (except one at the very first timestamp) with the
    previously released location, spending no budget on it.

    Parameters:
        seq - The point sequence. Points are indexed as p[0]..p[3];
              p[1], p[2] are latitude/longitude — assumes 4-column rows,
              TODO confirm against callers.
        lmdks - The landmarks.
        epsilon - The available privacy budget.

    Returns:
        rls_data - The perturbed data.
        bgts - The privacy budget allocation.
    '''
    # Event-level budget allocation: every release may spend the full budget
    bgts = np.array(len(seq)*[epsilon])
    # Released
    rls_data = [None]*len(seq)
    for i, p in enumerate(seq):
        if lmdk_lib.is_landmark(p, lmdks) and i > 0:
            # Approximate a landmark with the previously released location;
            # no budget is spent — and, unlike before, no noise is drawn
            # just to be thrown away.
            new_loc = (rls_data[i - 1][1], rls_data[i - 1][2])
            bgts[i] = 0
        else:
            # Regular point (or a landmark at the very first timestamp):
            # add noise to the original coordinates.
            new_loc = lmdk_lib.add_polar_noise((p[1], p[2]), bgts[i])
        rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
    return rls_data, bgts
|
|
|
|
|
|
def sample(seq, lmdks, epsilon):
    '''
    Publish randomly.

    Publishes each point with a fixed 50% probability; skipped points are
    approximated with the previous release, and skipped landmark budget is
    redistributed to later releases.

    Parameters:
        seq - The point sequence. Points are indexed as p[0]..p[3];
              p[1], p[2] are latitude/longitude — assumes 4-column rows,
              TODO confirm against callers.
        lmdks - The landmarks.
        epsilon - The available privacy budget.

    Returns:
        rls_data - The perturbed data.
        bgts - The privacy budget allocation.
        skipped - The number of skipped releases.
    '''
    # Uniform budget allocation as the starting point
    bgts = uniform(seq, lmdks, epsilon)
    # Released
    rls_data = [None]*len(seq)
    # The sampling rate
    # (publish with 50% chance)
    samp_rt = .5
    # Track how many landmarks have been seen so far
    lmdk_cur = 0
    # Track skipped releases
    skipped = 0
    for i, p in enumerate(seq):
        # Check if current point is a landmark
        if lmdk_lib.is_landmark(p, lmdks):
            lmdk_cur += 1
        # Get coordinates
        loc = (p[1], p[2])
        # The first point is always published
        if i == 0 or lmdk_lib.should_sample(samp_rt):
            # Add noise to original data
            new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
            rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
        else:
            skipped += 1
            # Skip current release and approximate with previous
            rls_data[i] = rls_data[i - 1]
            if lmdk_lib.is_landmark(p, lmdks):
                # Allocate the current budget to the following releases
                # uniformly (spread over the remaining landmarks + 1)
                for j in range(i + 1, len(seq)):
                    bgts[j] += bgts[i]/(len(lmdks) - lmdk_cur + 1)
                # No budget was spent
                bgts[i] = 0
    return rls_data, bgts, skipped
|
|
|
|
|
|
def discount(seq, lmdks, epsilon):
    '''
    Temporally discounted sampling.

    Like sample(), but the sampling rate is adjusted at each landmark in
    proportion to how recent that landmark is relative to the last one.

    Parameters:
        seq - The point sequence. Points are indexed as p[0]..p[3];
              p[1], p[2] are latitude/longitude — assumes 4-column rows,
              TODO confirm against callers.
        lmdks - The landmarks. Column 3 is used as the discounting key
              (presumably a timestamp) — verify against callers.
        epsilon - The available privacy budget.

    Returns:
        rls_data - The perturbed data.
        bgts - The privacy budget allocation.
        skipped - The number of skipped releases.
    '''
    # Uniform budget allocation as the starting point
    bgts = uniform(seq, lmdks, epsilon)
    # Released
    rls_data = [None]*len(seq)
    # Track how many landmarks have been seen so far
    lmdk_cur = 0
    # Track skipped releases
    skipped = 0
    # Initialize the sampling rate (publish with 50% chance)
    samp_rt = .5
    for i, p in enumerate(seq):
        # Check if current point is a landmark
        if lmdk_lib.is_landmark(p, lmdks):
            lmdk_cur += 1
            # Adjust the sampling rate: ratio of the current landmark's
            # column-3 value to the last landmark's (approaches 1 over time)
            if len(lmdks) > 1:
                samp_rt = lmdks[lmdk_cur - 1][3] / lmdks[len(lmdks) - 1][3]
        # Get coordinates
        loc = (p[1], p[2])
        # The first point is always published
        if i == 0 or lmdk_lib.should_sample(samp_rt):
            # Add noise to original data
            new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
            rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
        else:
            skipped += 1
            # Skip current release and approximate with previous
            rls_data[i] = rls_data[i - 1]
            if lmdk_lib.is_landmark(p, lmdks):
                # Allocate the current budget to the following releases
                # uniformly (spread over the remaining landmarks + 1)
                for j in range(i + 1, len(seq)):
                    bgts[j] += bgts[i]/(len(lmdks) - lmdk_cur + 1)
                # No budget was spent
                bgts[i] = 0
    return rls_data, bgts, skipped
|
|
|
|
|
|
def incremental(seq, lmdks, epsilon, diff_rt):
    '''
    Incremental sampling.

    The sampling rate rises at each landmark and decays at each regular
    point; skipped points are approximated with the previous release and
    skipped landmark budget is redistributed to later releases.

    Parameters:
        seq - The point sequence. Points are indexed as p[0]..p[3];
              p[1], p[2] are latitude/longitude — assumes 4-column rows,
              TODO confirm against callers.
        lmdks - The landmarks.
        epsilon - The available privacy budget.
        diff_rt - Sampling rate difference.

    Returns:
        rls_data - The perturbed data.
        bgts - The privacy budget allocation.
        skipped - The number of skipped releases.
    '''
    # Uniform budget allocation as the starting point
    bgts = uniform(seq, lmdks, epsilon)
    # Released
    rls_data = [None]*len(seq)
    # The sampling rate
    # (publish with 50% chance)
    samp_rt = .5
    # Track how many landmarks have been seen so far
    lmdk_cur = 0
    # Track consecutive landmarks
    # NOTE(review): lmdk_con is maintained but never read in this function.
    lmdk_con = 0
    # Track skipped releases
    skipped = 0
    for i, p in enumerate(seq):
        # Check if current point is a landmark
        if lmdk_lib.is_landmark(p, lmdks):
            lmdk_cur += 1
            # Increase sampling rate (move toward always publishing)
            samp_rt += (1 - samp_rt)*diff_rt
        else:
            # Decrease sampling rate
            samp_rt -= samp_rt*diff_rt
        # Get coordinates
        loc = (p[1], p[2])
        # The first point is always published
        if i == 0 or lmdk_lib.should_sample(samp_rt):
            # Add noise to original data
            new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
            rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
            # Adjust consecutive landmarks tracking
            if lmdk_lib.is_landmark(p, lmdks):
                lmdk_con += 1
        else:
            skipped += 1
            # Skip current release and approximate with previous
            rls_data[i] = rls_data[i - 1]
            if lmdk_lib.is_landmark(p, lmdks):
                # Reset consecutive landmarks tracking
                lmdk_con = 0
                # Allocate the current budget to the following releases
                # uniformly (spread over the remaining landmarks + 1)
                for j in range(i + 1, len(seq)):
                    bgts[j] += bgts[i]/(len(lmdks) - lmdk_cur + 1)
                # No budget was spent
                bgts[i] = 0
    return rls_data, bgts, skipped
|
|
|
|
|
|
def uniform_r(seq, lmdks, epsilon):
    '''Release every point, perturbed with its uniformly allocated budget.'''
    # Budgets via uniform allocation
    bgts = uniform(seq, lmdks, epsilon)
    # Perturb each point in turn and collect the releases
    rls_data = []
    for idx, point in enumerate(seq):
        noisy = lmdk_lib.add_polar_noise((point[1], point[2]), bgts[idx])
        rls_data.append([point[0], noisy[0], noisy[1], point[3]])
    return rls_data, bgts
|
|
|
|
|
|
def uniform_cont(seq, lmdks, epsilon):
    '''Continual release via randomized response under uniform budgets.'''
    # Budgets via uniform allocation
    bgts = uniform(seq, lmdks, epsilon)
    rls_data = [None]*len(seq)
    for idx, point in enumerate(seq):
        # True when the point's third field is a landmark
        truth = point[2] in lmdks
        # [original, perturbed]
        rls_data[idx] = [truth, lmdk_lib.randomized_response(truth, bgts[idx])]
    return rls_data, bgts
|
|
|
|
|
|
def utility_analysis(seq, lmdks, o, epsilon):
    '''
    Analyze the utility.

    Computes the Mean Absolute Error (in meters) of the perturbed data
    against the originals, compares it with a fresh uniform-budget
    perturbation, and prints the summary.

    Parameters:
        seq - The point sequence. Points are indexed so that p[1], p[2]
              are latitude/longitude — TODO confirm against callers.
        lmdks - The landmarks.
        o - The perturbed data.
        epsilon - The available privacy budget.

    Returns:
        Nothing.
    '''
    # Estimate Mean Absolute Error of the supplied perturbed data
    mae = 0
    # Compare with a uniform-budget baseline (re-perturbed here, so the
    # baseline figure varies run to run)
    mae_u = 0
    bgts_u = uniform(seq, lmdks, epsilon)
    for i, p in enumerate(seq):
        # Geodesic distance in meters, averaged over the sequence
        mae += distance((p[1], p[2]), (o[i][1], o[i][2])).km*1000/len(seq)
        new_loc = lmdk_lib.add_polar_noise((p[1], p[2]), bgts_u[i])
        mae_u += distance((p[1], p[2]), (new_loc[0], new_loc[1])).km*1000/len(seq)

    print(
        '\n########## Analysis ##########\n'
        'MAE uniform : %.2fm\n'
        'MAE current : %.2fm\n'
        'Difference : %.2f%%\n'
        '##############################\n'
        %(mae_u, mae, 100*(mae - mae_u)/mae_u)
    )
|
|
|
|
|
|
def mae(seq, o):
    '''Mean absolute error, in meters, between original and released points.'''
    total = 0
    for idx, point in enumerate(seq):
        # Geodesic distance in meters, averaged over the sequence
        total += distance((point[1], point[2]), (o[idx][1], o[idx][2])).km*1000/len(seq)
    return total
|
|
|
|
|
|
def mae_cont(o):
    '''
    Mean absolute error for the continual (randomized-response) releases.

    Parameters:
        o - The released data: a sequence of [original, perturbed] pairs.

    Returns:
        The fraction of releases whose perturbed value differs from the
        original (0 for an empty input).
    '''
    # Guard against an empty release list (the original loop would simply
    # not run; make that explicit).
    if not len(o):
        return 0
    # Count mismatches between original and perturbed values.
    # (A leftover debug print of the result was removed.)
    return sum(1 for pair in o if pair[0] != pair[1])/len(o)
|