the-last-thing/code/lib/lmdk_bgt.py

582 lines
16 KiB
Python
Raw Normal View History

2021-07-25 15:51:51 +02:00
#!/usr/bin/env python3
2021-07-25 17:40:54 +02:00
from datetime import datetime
2021-07-25 15:51:51 +02:00
from geopy.distance import distance
import math
import numpy as np
import random
import sympy as sp
import time
import lmdk_lib
from matplotlib import pyplot as plt
def draw_bgts(title, bgts):
'''
Plots the allocated budget.
Parameters:
title - The title of the plot.
bgts - The allocated privacy budget.
Returns:
Nothing.
'''
lmdk_lib.plot_init()
p = np.arange(1, len(bgts) + 1, 1)
plt.plot(
p,
bgts,
linewidth=lmdk_lib.line_width,
markersize=lmdk_lib.marker_size,
markeredgewidth=0,
label=r'$\varepsilon$',
marker='s'
)
lmdk_lib.plot_legend()
# Set plot box
plt.axis([1, len(bgts), .0, max(bgts) + max(bgts)/2])
# Set x axis label
plt.xlabel('Timestamp')
# Set y axis label
plt.ylabel('Privacy budget')
plt.title(title.replace('_', '-'))
plt.show()
def validate_bgts(seq, lmdks, epsilon, bgts):
'''
Budget allocation validation.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
epsilon - The available privacy budget.
bgts - The privacy budget allocation.
Returns:
The index of the privacy budget that caused the failure or -1 if successful.
'''
bgts_sum = .0
# Landmarks
for i, p in enumerate(seq):
if np.any(lmdks[:] == p):
bgts_sum += bgts[i]
# Regular events
bgts_max = bgts_sum
for i, p in enumerate(seq):
if not np.any(lmdks[:] == p):
bgts_cur = bgts_sum + bgts[i]
if bgts_cur > bgts_max:
bgts_max = bgts_cur
if bgts_cur > epsilon and not math.isclose(bgts_cur, epsilon, rel_tol=.001):
print('Budget allocation validation failed: %.2f%% (%.4f/%.4f)' %(100*bgts_max/epsilon, bgts_max, epsilon))
return i
print(
'Landmark budget allocation: %.2f%% (%.4f/%.4f)\n'
'Overall budget allocation : %.2f%% (%.4f/%.4f)\n'
%(100*bgts_max/epsilon, bgts_max, epsilon,
100*sum(bgts)/epsilon, sum(bgts), epsilon,))
return -1
def uniform(seq, lmdks, epsilon):
'''
Uniform budget allocation.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
epsilon - The available privacy budget.
Returns:
bgts - The privacy budget allocation.
'''
bgts = np.zeros(len(seq))
# Allocate enough budget for all landmarks and one regular point
k = len(lmdks) + 1
# All points are landmarks
if k > len(seq):
k = len(lmdks)
# Allocate the budget
for i, _ in enumerate(bgts):
bgts[i] = epsilon/k
return bgts
def geometric(seq, lmdks, epsilon):
'''
Geometric budget allocation.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
epsilon - The available privacy budget.
Returns:
bgts - The privacy budget distribution.
'''
bgts = np.zeros([len(seq)])
epsilon_avail = epsilon
# Landmarks
for i, p in enumerate(seq):
if np.any(lmdks[:] == p):
bgts[i] = epsilon_avail/2
epsilon_avail = epsilon_avail - epsilon_avail/2
# Regular events
for i, p in enumerate(seq):
if not np.any(lmdks[:] == p):
bgts[i] = epsilon_avail
return bgts
def exponential(seq, lmdks, epsilon):
'''
Exponential uniform budget allocation (max to user-level).
Parameters:
seq - The point sequence.
lmdks - The landmarks.
epsilon - The available privacy budget.
Returns:
bgts - The privacy budget allocation.
'''
# Fallback to uniform if zero or max landmarks
if len(lmdks) == 0 or len(lmdks) == len(seq):
return uniform(seq, lmdks, epsilon)
# # In case of seq == lmdks
# l = 0
# N = epsilon/len(seq)
# Otherwise
bgts = np.zeros([len(seq)])
if len(seq) != len(lmdks):
# Find worst case regural point
p_sel = 0
for p in seq:
if not np.any(lmdks[:] == p):
p_sel = p
break
# List all landmark timestamps with the worst regular
points = np.append(lmdks, [p_sel])
points = np.sort(points)
# epsilon_t = N*e^-l*t
l = sp.symbols('l', real=True)
# Cumulative allocation at landmarks and one extra point, i.e., L union {t}
T = seq.max()
# Bounding the privacy budgets at L union {t}
# epsilon = sum(N*e^-l*t) for t in L union {t}
t = sp.symbols('t')
eq = 0
for t in points:
eq += (((epsilon/len(seq))/(sp.exp(-1*l*T)))*sp.exp(-1*l*t))
try:
l = sp.solve(eq - epsilon, simplify=False, rational=False)[0]
# l = sp.solveset(eq <= epsilon, l, sp.S.Reals).sup
except Exception:
return bgts
# Allocate to the last point T epsilon/len(seq), i.e., user-level
N = (epsilon/len(seq))/sp.exp(-1*l*T)
# Allocate the budget
for i, t in enumerate(seq):
bgts[i] = N*sp.exp(-1*l*t)
return bgts
def linear_zero(seq, lmdks, epsilon):
'''
Linear uniform budget allocation (max to zero).
Parameters:
seq - The point sequence.
lmdks - The landmarks.
epsilon - The available privacy budget.
Returns:
bgts - The privacy budget allocation.
'''
bgts = np.zeros([len(seq)])
# Find worst case regural point
p_sel = 0
for p in seq:
if not np.any(lmdks[:] == p):
p_sel = p
break
# Sum all landmark timestamps with the worst regular
sum_cur = p_sel
for p in lmdks:
sum_cur += p
# epsilon_t = a*t + b
b = sp.symbols('b')
# Cumulative allocation at landmarks and one extra point
T = seq[len(seq) - 1]
b = sp.solve(b - ((b/T)*sum_cur + epsilon)/(len(lmdks) + 1))[0]
# Allocate to the last point 0
a = -b/T
# Allocate the budget
for i, t in enumerate(seq):
bgts[i] = a*t + b
return bgts
def linear(seq, lmdks, epsilon):
'''
Linear uniform budget allocation (max to user-level).
Parameters:
seq - The point sequence.
lmdks - The landmarks.
epsilon - The available privacy budget.
Returns:
bgts - The privacy budget allocation.
'''
# Fallback to uniform if zero or max landmarks
if len(lmdks) == 0 or len(lmdks) == len(seq):
return uniform(seq, lmdks, epsilon)
# Find worst case regural point
p_sel = 0
for p in seq:
if not np.any(lmdks[:] == p):
p_sel = p
break
# Sum all landmark timestamps with the worst regular
sum_cur = p_sel + np.sum(lmdks)
# epsilon_t = a*t + b
b = sp.symbols('b', real=True)
# Cumulative allocation at landmarks and one extra point, i.e., L union {t}
T = seq.max()
# Number of points to take into account
k = len(lmdks) + 1
# if len(lmdks) == len(seq):
# k = len(lmdks)
# Bounding the privacy budgets at L union {t}
# epsilon = a*sum(L union {t}) + |L union {t}|*b
b = sp.solve(b + (((epsilon/len(seq) - b)/T)*sum_cur - epsilon)/k, simplify=False, rational=False)[0]
# Allocate to the last point T epsilon/len(seq), i.e., user-level
a = (epsilon/len(seq) - b)/T
# Allocate the budget
bgts = np.zeros([len(seq)])
for i, t in enumerate(seq):
bgts[i] = a*t + b
return bgts
def plot_line(x_i, arr, label, marker, line):
plt.plot(x_i,
arr,
label=label,
color='black',
marker=marker,
linestyle=line,
# linewidth=1.0,
markerfacecolor='none')
def stepped(seq, lmdks, epsilon):
'''
Stepped budget allocation.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
epsilon - The available privacy budget.
Returns:
bgts - The privacy budget allocation.
'''
bgts = np.zeros([len(seq)])
epsilon_avail = epsilon/2
for i, p in enumerate(seq):
# Reduce the available budget when a landmark is reached
if np.any(lmdks[:] == p):
epsilon_avail = epsilon_avail/2
bgts[i] = epsilon_avail
return bgts
def adaptive(seq, lmdks, epsilon, inc_rt, dec_rt):
2021-07-25 15:51:51 +02:00
'''
Adaptive budget allocation.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
epsilon - The available privacy budget.
inc_rt - Sampling rate increase rate.
dec_rt - Sampling rate decrease rate.
2021-07-25 15:51:51 +02:00
Returns:
rls_data - The perturbed data.
bgts - The privacy budget allocation.
skipped - The number of skipped releases.
'''
# Uniform budget allocation
bgts = uniform(seq, lmdks, epsilon)
# Released
rls_data = [None]*len(seq)
# The sampling rate
samp_rt = 1
# Track landmarks
lmdk_cur = 0
# Track skipped releases
skipped = 0
for i, p in enumerate(seq):
# Check if current point is a landmark
if lmdk_lib.is_landmark(p, lmdks):
lmdk_cur += 1
# Get coordinates
loc = (p[1], p[2])
if lmdk_lib.should_sample(samp_rt) or i == 0:
# Add noise to original data
new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
# Adjust sampling rate
if i > 0:
if distance((rls_data[i - 1][1], rls_data[i - 1][2]), new_loc).km*1000 < 1/bgts[i]:
# Decrease
samp_rt -= samp_rt*dec_rt
2021-07-25 15:51:51 +02:00
else:
# Increase
samp_rt += (1 - samp_rt)*inc_rt
2021-07-25 15:51:51 +02:00
else:
skipped += 1
# Skip current release and approximate with previous
rls_data[i] = rls_data[i - 1]
if lmdk_lib.is_landmark(p, lmdks):
# Allocate the current budget to the following releases uniformly
for j in range(i + 1, len(seq)):
bgts[j] += bgts[i]/(len(lmdks) - lmdk_cur + 1)
# No budget was spent
bgts[i] = 0
return rls_data, bgts, skipped
def skip(seq, lmdks, epsilon):
'''
Skip landmarks.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
epsilon - The available privacy budget.
Returns:
rls_data - The perturbed data.
bgts - The privacy budget allocation.
'''
# Event-level budget allocation
bgts = np.array(len(seq)*[epsilon])
# Released
rls_data = [None]*len(seq)
for i, p in enumerate(seq):
# Get coordinates
loc = (p[1], p[2])
# Add noise to original data
new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
# Check if current point is a landmark
if lmdk_lib.is_landmark(p, lmdks):
if i > 0:
# Approximate with previous location
new_loc = (rls_data[i - 1][1], rls_data[i - 1][2])
bgts[i] = 0
rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
return rls_data, bgts
def sample(seq, lmdks, epsilon):
'''
Publish randomly.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
epsilon - The available privacy budget.
Returns:
rls_data - The perturbed data.
bgts - The privacy budget allocation.
skipped - The number of skipped releases.
'''
# Uniform budget allocation
bgts = uniform(seq, lmdks, epsilon)
# Released
rls_data = [None]*len(seq)
# The sampling rate
# (publish with 50% chance)
samp_rt = .5
# Track landmarks
lmdk_cur = 0
# Track skipped releases
skipped = 0
for i, p in enumerate(seq):
# Check if current point is a landmark
if lmdk_lib.is_landmark(p, lmdks):
lmdk_cur += 1
2021-07-25 17:40:54 +02:00
# Get coordinates
loc = (p[1], p[2])
if i == 0 or lmdk_lib.should_sample(samp_rt):
# Add noise to original data
new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
else:
skipped += 1
# Skip current release and approximate with previous
rls_data[i] = rls_data[i - 1]
if lmdk_lib.is_landmark(p, lmdks):
# Allocate the current budget to the following releases uniformly
for j in range(i + 1, len(seq)):
bgts[j] += bgts[i]/(len(lmdks) - lmdk_cur + 1)
# No budget was spent
bgts[i] = 0
return rls_data, bgts, skipped
def discount(seq, lmdks, epsilon):
'''
Temporally discounted sampling.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
epsilon - The available privacy budget.
Returns:
rls_data - The perturbed data.
bgts - The privacy budget allocation.
skipped - The number of skipped releases.
'''
# Uniform budget allocation
bgts = uniform(seq, lmdks, epsilon)
# Released
rls_data = [None]*len(seq)
# Track landmarks
lmdk_cur = 0
# Track skipped releases
skipped = 0
for i, p in enumerate(seq):
# The sampling rate
samp_rt = datetime.fromtimestamp(int(lmdks[lmdk_cur][3]))/datetime.fromtimestamp(int(lmdks[len(lmdks) - 1][3]))
# Check if current point is a landmark
if lmdk_lib.is_landmark(p, lmdks):
lmdk_cur += 1
2021-07-25 15:51:51 +02:00
# Get coordinates
loc = (p[1], p[2])
if i == 0 or lmdk_lib.should_sample(samp_rt):
# Add noise to original data
new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
else:
skipped += 1
# Skip current release and approximate with previous
rls_data[i] = rls_data[i - 1]
if lmdk_lib.is_landmark(p, lmdks):
# Allocate the current budget to the following releases uniformly
for j in range(i + 1, len(seq)):
bgts[j] += bgts[i]/(len(lmdks) - lmdk_cur + 1)
# No budget was spent
bgts[i] = 0
return rls_data, bgts, skipped
2021-07-26 16:22:02 +02:00
def incremental(seq, lmdks, epsilon, diff_rt):
'''
Incremental sampling.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
epsilon - The available privacy budget.
diff_rt - Sampling rate difference.
Returns:
rls_data - The perturbed data.
bgts - The privacy budget allocation.
skipped - The number of skipped releases.
'''
# Uniform budget allocation
bgts = uniform(seq, lmdks, epsilon)
# Released
rls_data = [None]*len(seq)
# The sampling rate
# (publish with 50% chance)
samp_rt = .5
# Track landmarks
lmdk_cur = 0
# Track consecutive landmarks
lmdk_con = 0
# Track skipped releases
skipped = 0
for i, p in enumerate(seq):
# Check if current point is a landmark
if lmdk_lib.is_landmark(p, lmdks):
lmdk_cur += 1
# Increase sampling rate
samp_rt += (1 - samp_rt)*diff_rt
else:
# Decrease sampling rate
samp_rt -= samp_rt*diff_rt
# Get coordinates
loc = (p[1], p[2])
if i == 0 or lmdk_lib.should_sample(samp_rt):
# Add noise to original data
new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
# Adjust sampling rate
if lmdk_lib.is_landmark(p, lmdks):
# Adjust consecutive landmarks tracking
lmdk_con += 1
else:
skipped += 1
# Skip current release and approximate with previous
rls_data[i] = rls_data[i - 1]
if lmdk_lib.is_landmark(p, lmdks):
# Reset consecutive landmarks tracking
lmdk_con = 0
# Allocate the current budget to the following releases uniformly
for j in range(i + 1, len(seq)):
bgts[j] += bgts[i]/(len(lmdks) - lmdk_cur + 1)
# No budget was spent
bgts[i] = 0
return rls_data, bgts, skipped
2021-07-25 15:51:51 +02:00
def uniform_r(seq, lmdks, epsilon):
# Released
rls_data = [None]*len(seq)
# Budgets
bgts = uniform(seq, lmdks, epsilon)
for i, p in enumerate(seq):
# Get coordinates
loc = (p[1], p[2])
# Add noise to original data
new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
return rls_data, bgts
def utility_analysis(seq, lmdks, o, epsilon):
'''
Analyze the utility.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
o - The perturbed data.
epsilon - The available privacy budget.
Returns:
Nothing.
'''
# Estimate Mean Absolute Error
mae = 0
# Compare with uniform
mae_u = 0
bgts_u = uniform(seq, lmdks, epsilon)
for i, p in enumerate(seq):
mae += distance((p[1], p[2]), (o[i][1], o[i][2])).km*1000/len(seq)
new_loc = lmdk_lib.add_polar_noise((p[1], p[2]), bgts_u[i])
mae_u += distance((p[1], p[2]), (new_loc[0], new_loc[1])).km*1000/len(seq)
print(
'\n########## Analysis ##########\n'
'MAE uniform : %.2fm\n'
'MAE current : %.2fm\n'
'Difference : %.2f%%\n'
'##############################\n'
%(mae_u, mae, 100*(mae - mae_u)/mae_u)
)
def mae(seq, o):
mae = 0
for i, p in enumerate(seq):
mae += distance((p[1], p[2]), (o[i][1], o[i][2])).km*1000/len(seq)
return mae