537 lines
15 KiB
Python
537 lines
15 KiB
Python
#!/usr/bin/env python3
|
|
|
|
import itertools
|
|
from lmdk_lib import *
|
|
import exp_mech
|
|
import numpy as np
|
|
import random
|
|
import scipy.stats as stats
|
|
import time
|
|
|
|
|
|
'''
|
|
Print all the points.
|
|
|
|
Parameters:
|
|
seq - The point sequence.
|
|
combs - All the possible point combinations for a specified size.
|
|
lmdks - The landmarks.
|
|
Returns:
|
|
Nothing.
|
|
'''
|
|
def print_rslt(seq, combs, lmdks):
|
|
eval_sum = .0
|
|
for idx, c in enumerate(combs):
|
|
rslt = str(idx + 1) + ':\t'
|
|
for i in seq:
|
|
# Selected
|
|
if i in c:
|
|
rslt += '(' + str(i) + ')\t'
|
|
# Landmark
|
|
elif i in lmdks:
|
|
rslt += '*' + str(i) + '*\t'
|
|
# Not selected
|
|
else:
|
|
rslt += ' ' + str(i) + ' \t'
|
|
dists = get_rel_dists(seq, list(c), lmdks)
|
|
eval_orig = eval_seq(get_rel_dists(seq, [], lmdks))
|
|
eval_cur = eval_seq(dists)
|
|
eval_sum += eval_cur
|
|
# print(rslt, '\t', dists, '\t', sum(dists), '\t', eval_cur)
|
|
print(rslt, eval_cur)
|
|
eval_avg = eval_sum/len(combs)
|
|
print('Average STD (difference with original): %.4f (%.2f%%)' %(eval_avg, 100*(eval_avg - eval_orig)/eval_orig))
|
|
|
|
|
|
'''
|
|
Print the difference with the original.
|
|
|
|
Parameters:
|
|
seq - The point sequence.
|
|
combs - All the possible point combinations for a specified size.
|
|
lmdks - The landmarks.
|
|
Returns:
|
|
The difference with the original.
|
|
'''
|
|
def print_diff(seq, combs, lmdks):
|
|
eval_sum = .0
|
|
for c in combs:
|
|
dists = get_rel_dists(seq, list(c), lmdks)
|
|
eval_orig = eval_seq(get_rel_dists(seq, [], lmdks))
|
|
eval_cur = eval_seq(dists)
|
|
eval_sum += eval_cur
|
|
eval_avg = eval_sum/len(combs)
|
|
diff = 100*(eval_avg - eval_orig)/eval_orig
|
|
print('Average STD (difference with original): %.4f (%.2f%%)' %(eval_avg, diff))
|
|
return diff
|
|
|
|
|
|
'''
|
|
Finds the optimal set of regular points.
|
|
|
|
Parameters:
|
|
seq - The point sequence.
|
|
lmdks - The landmarks.
|
|
Returns:
|
|
The optimal option.
|
|
|
|
Requirements:
|
|
n = Regular points
|
|
r = The size of a combination
|
|
Time - O(C(n, r) + 2^C(n, r))
|
|
Space - O(r*C(n, r))
|
|
'''
|
|
def get_opts_optim(seq, lmdks):
|
|
# Evaluate the original
|
|
eval_orig = eval_seq(get_rel_dists(seq, [], lmdks))
|
|
|
|
# Get all possible options
|
|
opts = get_opts(seq, lmdks)
|
|
|
|
# Evaluate options
|
|
# Track the minimum (best) evaluation
|
|
diff_min = float('inf')
|
|
|
|
# Track the optimal sequence (the one with the best evaluation)
|
|
optim = []
|
|
for opt in opts:
|
|
eval_sum = 0
|
|
for o in opt:
|
|
eval_sum += eval_seq(get_rel_dists(seq, o, lmdks))
|
|
# Compare with current optimal
|
|
diff_cur = abs(eval_sum/len(opt) - eval_orig)
|
|
if diff_cur < diff_min:
|
|
diff_min = diff_cur
|
|
optim = list(opt)
|
|
|
|
return optim
|
|
|
|
|
|
'''
|
|
Finds a set of regular points from top (less) to bottom (many)
|
|
(seems to perform better than the bottom-to-top approach).
|
|
|
|
Parameters:
|
|
seq - The point sequence.
|
|
lmdks - The landmarks.
|
|
Returns:
|
|
The resulting set of options.
|
|
|
|
Requirements:
|
|
For all possible sets of regular points n such that n = len(seq) - len(lmdks).
|
|
The result is a set of options for every possible value of n and for all possible combinations for each n.
|
|
Time - O(n^2)
|
|
Space - O(n^2)
|
|
'''
|
|
def get_opts_from_top(seq, lmdks):
|
|
# Evaluate the original
|
|
eval_orig = eval_seq(get_rel_dists(seq, [], lmdks))
|
|
|
|
opts = []
|
|
lmdks_cur = np.array(lmdks)
|
|
while not np.array_equal(lmdks_cur, seq):
|
|
# Find the combinations for one more point
|
|
reg = get_reg(seq, lmdks_cur)
|
|
|
|
# Track the minimum (best) evaluation
|
|
diff_min = float('inf')
|
|
|
|
point = ()
|
|
for r in reg:
|
|
# Evaluate current
|
|
eval_cur = eval_seq(get_rel_dists(seq, r, lmdks_cur))
|
|
|
|
# Compare evaluations
|
|
if abs(eval_cur - eval_orig) <= diff_min:
|
|
diff_min = abs(eval_cur - eval_orig)
|
|
point = r
|
|
|
|
# Save new point to landmarks
|
|
lmdks_cur = np.append(lmdks_cur, point)
|
|
lmdks_cur.sort()
|
|
|
|
# Add new option
|
|
opts.append(np.setdiff1d(lmdks_cur, lmdks))
|
|
|
|
return opts
|
|
|
|
def get_opts_from_top_h(seq, lmdks):
|
|
# Create histogram
|
|
hist, h = get_hist(seq, lmdks)
|
|
# Keep track of points
|
|
hist_cur = np.copy(hist)
|
|
# The options to be returned
|
|
hist_opts = []
|
|
# Keep adding points until the maximum is reached
|
|
while np.sum(hist_cur) < len(seq):
|
|
# Track the minimum (best) evaluation
|
|
diff_min = float('inf')
|
|
# The candidate option
|
|
hist_cand = np.copy(hist_cur)
|
|
# Check every possibility
|
|
for i, h_i in enumerate(hist_cur):
|
|
# Can we add one more point?
|
|
if h_i + 1 <= h:
|
|
hist_tmp = np.copy(hist_cur)
|
|
hist_tmp[i] += 1
|
|
# Find difference from original
|
|
diff_cur = get_norm(hist, hist_tmp) # Euclidean
|
|
# diff_cur = get_emd(hist, hist_tmp) # Wasserstein
|
|
# Remember if it is the best that you've seen
|
|
if diff_cur < diff_min:
|
|
diff_min = diff_cur
|
|
hist_cand = np.copy(hist_tmp)
|
|
# Update current histogram
|
|
hist_cur = np.copy(hist_cand)
|
|
# Add current best to options
|
|
hist_opts.append(hist_cand)
|
|
# Return options
|
|
return hist_opts
|
|
|
|
|
|
def get_non_opts_from_top(seq, lmdks):
|
|
# Evaluate the original
|
|
eval_orig = eval_seq(get_rel_dists(seq, [], lmdks))
|
|
|
|
non_opts = []
|
|
lmdks_cur = np.array(lmdks)
|
|
while not np.array_equal(lmdks_cur, seq):
|
|
# Find the combinations for one more point
|
|
reg = get_reg(seq, lmdks_cur)
|
|
|
|
# Track the maximum (worst) evaluation
|
|
diff_max = .0
|
|
|
|
point = ()
|
|
for r in reg:
|
|
# Evaluate current
|
|
eval_cur = eval_seq(get_rel_dists(seq, r, lmdks_cur))
|
|
|
|
# Compare evaluations
|
|
if abs(eval_cur - eval_orig) >= diff_max:
|
|
diff_max = abs(eval_cur - eval_orig)
|
|
point = r
|
|
|
|
# Save new point to landmarks
|
|
lmdks_cur = np.append(lmdks_cur, point)
|
|
lmdks_cur.sort()
|
|
|
|
# Add new option
|
|
non_opts.append(np.setdiff1d(lmdks_cur, lmdks))
|
|
|
|
return non_opts
|
|
|
|
|
|
'''
|
|
Finds a set of regular points from bottom (many) to top (less)
|
|
(seems to perform worse than the top-to-bottom approach).
|
|
|
|
Parameters:
|
|
seq - The point sequence.
|
|
lmdks - The landmarks.
|
|
Returns:
|
|
The resulting set of options.
|
|
|
|
Requirements:
|
|
For all possible sets of regular points n such that n = len(seq) - len(lmdks).
|
|
The result is a set of options for every possible value of n and for all possible combinations for each n.
|
|
Time - O(n^2)
|
|
Space - O(n^2)
|
|
'''
|
|
def get_opts_from_bottom(seq, lmdks):
|
|
# Evaluate the original
|
|
eval_orig = eval_seq(get_rel_dists(seq, [], lmdks))
|
|
|
|
# Start with all regular points as landmarks
|
|
lmdks_cur = np.array(get_reg(seq, lmdks))
|
|
|
|
opts = [lmdks_cur]
|
|
while lmdks_cur.size != 1:
|
|
# Track the minimum (best) evaluation
|
|
diff_min = float('inf')
|
|
|
|
point = ()
|
|
for lmdk in lmdks_cur:
|
|
# Evaluate current by removing one point
|
|
eval_cur = eval_seq(get_rel_dists(seq, [], np.setdiff1d(lmdks_cur, lmdk)))
|
|
|
|
# Compare evaluations
|
|
if abs(eval_cur - eval_orig) <= diff_min:
|
|
diff_min = abs(eval_cur - eval_orig)
|
|
point = lmdk
|
|
|
|
# Remove point from landmarks
|
|
lmdks_cur = np.setdiff1d(lmdks_cur, point)
|
|
|
|
# Add new option
|
|
opts.append(np.setdiff1d(lmdks_cur, lmdks))
|
|
|
|
return opts
|
|
|
|
|
|
def get_opts_from_bottom_h(seq, lmdks):
|
|
# Create histogram
|
|
hist, h = get_hist(seq, lmdks)
|
|
# Keep track of points
|
|
hist_cur = np.array([h]*len(hist))
|
|
while np.sum(hist_cur) > len(seq):
|
|
hist_cur[-1] -= 1
|
|
# The options to be returned
|
|
hist_opts = []
|
|
# Keep removing points until the minimum is reached
|
|
while np.sum(hist_cur) > np.sum(hist):
|
|
# Track the minimum (best) evaluation
|
|
diff_min = float('inf')
|
|
# The candidate option
|
|
hist_cand = np.copy(hist_cur)
|
|
# Check every possibility
|
|
for i, h_i in enumerate(hist_cur):
|
|
# Can we remove one more point?
|
|
if h_i - 1 >= hist[i]:
|
|
hist_tmp = np.copy(hist_cur)
|
|
hist_tmp[i] -= 1
|
|
# Find difference from original
|
|
# diff_cur = get_norm(hist, hist_tmp) # Euclidean
|
|
diff_cur = get_emd(hist, hist_tmp) # Wasserstein
|
|
# Remember if it is the best that you've seen
|
|
if diff_cur < diff_min:
|
|
diff_min = diff_cur
|
|
hist_cand = np.copy(hist_tmp)
|
|
# Update current histogram
|
|
hist_cur = np.copy(hist_cand)
|
|
# Add current best to options
|
|
hist_opts.append(hist_cand)
|
|
# Return options
|
|
return hist_opts
|
|
|
|
def get_non_opts_from_bottom(seq, lmdks):
|
|
# Evaluate the original
|
|
eval_orig = eval_seq(get_rel_dists(seq, [], lmdks))
|
|
|
|
# Start with all regular points as landmarks
|
|
lmdks_cur = np.array(get_reg(seq, lmdks))
|
|
|
|
non_opts = [lmdks_cur]
|
|
while lmdks_cur.size != 1:
|
|
# Track the maximum (worst) evaluation
|
|
diff_max = .0
|
|
|
|
point = ()
|
|
for lmdk in lmdks_cur:
|
|
# Evaluate current by removing one point
|
|
eval_cur = eval_seq(get_rel_dists(seq, [], np.setdiff1d(lmdks_cur, lmdk)))
|
|
|
|
# Compare evaluations
|
|
if abs(eval_cur - eval_orig) >= diff_max:
|
|
diff_max = abs(eval_cur - eval_orig)
|
|
point = lmdk
|
|
|
|
# Remove point from landmarks
|
|
lmdks_cur = np.setdiff1d(lmdks_cur, point)
|
|
|
|
# Add new option
|
|
non_opts.append(np.setdiff1d(lmdks_cur, lmdks))
|
|
|
|
return non_opts
|
|
|
|
|
|
def find_lmdks(seq, lmdks, epsilon):
|
|
'''
|
|
Add dummy landmarks to original landmarks.
|
|
|
|
Parameters:
|
|
seq - All of the data points.
|
|
lmdks - The original landmarks.
|
|
epsilon - The available privacy budget.
|
|
|
|
Returns:
|
|
lmdks_new - The new landmarks.
|
|
The remaining privacy budget.
|
|
'''
|
|
# The new landmarks
|
|
lmdks_new = lmdks
|
|
# The privacy budget to use
|
|
eps_sel = 0
|
|
if len(lmdks) > 0 and len(seq) != len(lmdks):
|
|
# Get landmarks timestamps in sequence
|
|
lmdks_seq = find_lmdks_seq(seq, lmdks)
|
|
# Turn landmarks to histogram
|
|
hist, h = get_hist(get_seq(1, len(seq)), lmdks_seq)
|
|
# Find all possible options
|
|
opts = get_opts_from_top_h(get_seq(1, len(seq)), lmdks_seq)
|
|
# Landmarks selection budget
|
|
eps_sel = epsilon/(len(lmdks_seq) + 1)
|
|
# Get landmarks histogram with dummy landmarks
|
|
hist_new, _ = exp_mech.exponential(hist, opts, exp_mech.score, 1.0, eps_sel)
|
|
# Split sequence in parts of size h
|
|
pt_idx = []
|
|
for idx in range(1, len(seq), h):
|
|
pt_idx.append([idx, idx + h - 1])
|
|
pt_idx[-1][1] = len(seq)
|
|
# Get new landmarks indexes
|
|
lmdks_seq_new = np.array([], dtype=int)
|
|
for i, pt in enumerate(pt_idx):
|
|
# Already landmarks
|
|
lmdks_seq_pt = lmdks_seq[(lmdks_seq >= pt[0]) & (lmdks_seq <= pt[1])]
|
|
# Sample randomly from the rest of the sequence
|
|
size = hist_new[i] - len(lmdks_seq_pt)
|
|
rglr = np.setdiff1d(np.arange(pt[0], pt[1] + 1), lmdks_seq_pt)
|
|
# Add already landmarks
|
|
lmdks_seq_new = np.concatenate([lmdks_seq_new, lmdks_seq_pt])
|
|
# Add new landmarks
|
|
if size > 0 and len(rglr) > size:
|
|
lmdks_seq_new = np.concatenate([lmdks_seq_new,
|
|
np.random.choice(
|
|
rglr,
|
|
size = size,
|
|
replace = False
|
|
)
|
|
])
|
|
# Get actual landmarks values
|
|
lmdks_new = seq[lmdks_seq_new - 1]
|
|
return lmdks_new, epsilon - eps_sel
|
|
|
|
|
|
def find_lmdks_eps(seq, lmdks, epsilon):
|
|
'''
|
|
Add dummy landmarks to original landmarks.
|
|
|
|
Parameters:
|
|
seq - All of the data points.
|
|
lmdks - The original landmarks.
|
|
epsilon - The available privacy budget.
|
|
|
|
Returns:
|
|
lmdks_new - The new landmarks.
|
|
'''
|
|
# The new landmarks
|
|
lmdks_new = lmdks
|
|
if len(lmdks) > 0 and len(seq) != len(lmdks):
|
|
# Get landmarks timestamps in sequence
|
|
lmdks_seq = find_lmdks_seq(seq, lmdks)
|
|
# Turn landmarks to histogram
|
|
hist, h = get_hist(get_seq(1, len(seq)), lmdks_seq)
|
|
# Find all possible options
|
|
opts = get_opts_from_top_h(get_seq(1, len(seq)), lmdks_seq)
|
|
# Get landmarks histogram with dummy landmarks
|
|
hist_new, _ = exp_mech.exponential(hist, opts, exp_mech.score, 1.0, epsilon)
|
|
# Split sequence in parts of size h
|
|
pt_idx = []
|
|
for idx in range(1, len(seq), h):
|
|
pt_idx.append([idx, idx + h - 1])
|
|
pt_idx[-1][1] = len(seq)
|
|
# Get new landmarks indexes
|
|
lmdks_seq_new = np.array([], dtype=int)
|
|
for i, pt in enumerate(pt_idx):
|
|
# Already landmarks
|
|
lmdks_seq_pt = lmdks_seq[(lmdks_seq >= pt[0]) & (lmdks_seq <= pt[1])]
|
|
# Sample randomly from the rest of the sequence
|
|
size = hist_new[i] - len(lmdks_seq_pt)
|
|
rglr = np.setdiff1d(np.arange(pt[0], pt[1] + 1), lmdks_seq_pt)
|
|
# Add already landmarks
|
|
lmdks_seq_new = np.concatenate([lmdks_seq_new, lmdks_seq_pt])
|
|
# Add new landmarks
|
|
if size > 0 and len(rglr) > size:
|
|
lmdks_seq_new = np.concatenate([lmdks_seq_new,
|
|
np.random.choice(
|
|
rglr,
|
|
size = size,
|
|
replace = False
|
|
)
|
|
])
|
|
# Get actual landmarks values
|
|
lmdks_new = seq[lmdks_seq_new - 1]
|
|
return lmdks_new
|
|
|
|
|
|
|
|
def test():
|
|
# Start and end points of the sequence
|
|
# # Nonrandom
|
|
# start = 1
|
|
# end = 10
|
|
# Random
|
|
start = 1
|
|
end = random.randint(start + 1, 100)
|
|
|
|
# Landmarks
|
|
# # Nonrandom
|
|
# lmdks = np.array([1, 3, 5, 8])
|
|
# Random
|
|
size = random.randint(start, end - 1)
|
|
lmdks = np.array(random.sample(range(start, end), size))
|
|
lmdks.sort()
|
|
|
|
# Print the parameters
|
|
print('Start : %d\n'
|
|
'End : %d\n'
|
|
'Size : %d\n'
|
|
'Landmarks: %s'
|
|
%(start, end, len(lmdks), str(lmdks)))
|
|
|
|
# Get the point sequence
|
|
seq = get_seq(start, end)
|
|
|
|
# Almost optimal solution
|
|
# print('\nOptimal...')
|
|
# t = time.time()
|
|
|
|
# opts_optim = get_opts_optim(seq, lmdks)
|
|
|
|
# print('Time:', time.time() - t, 'secs\n')
|
|
|
|
# print_rslt(seq, opts_optim, lmdks)
|
|
|
|
|
|
# Top to bottom approach
|
|
print('\nTop to bottom heuristic...')
|
|
t = time.time()
|
|
|
|
opts = get_opts_from_top(seq, lmdks)
|
|
|
|
print('Time:', time.time() - t, 'secs')
|
|
|
|
# print_rslt(seq, opts, lmdks)
|
|
diff_opt = print_diff(seq, opts, lmdks)
|
|
|
|
print('Non optimal version...')
|
|
non_opts = get_non_opts_from_top(seq, lmdks)
|
|
diff_non_opt = print_diff(seq, non_opts, lmdks)
|
|
print('Non optimal is %.2f%% different ([+]: worse | [-]: better).' %(100*(diff_non_opt - diff_opt)/diff_opt))
|
|
|
|
|
|
# Bottom to top approach
|
|
# Seems to perform worse
|
|
print('\nBottom to top heuristic...')
|
|
t = time.time()
|
|
|
|
opts = get_opts_from_bottom(seq, lmdks)
|
|
|
|
print('Time:', time.time() - t, 'secs')
|
|
|
|
# print_rslt(seq, opts, lmdks)
|
|
diff_opt = print_diff(seq, opts, lmdks)
|
|
|
|
print('Non optimal version...')
|
|
non_opts = get_non_opts_from_bottom(seq, lmdks)
|
|
diff_non_opt = print_diff(seq, non_opts, lmdks)
|
|
print('Non optimal is %.2f%% different ([+]: worse | [-]: better).' %(100*(diff_non_opt - diff_opt)/diff_opt))
|
|
|
|
|
|
# # Debugging
|
|
# # Number of desired actual and dummy landmarks
|
|
# k = len(lmdks) + 5
|
|
# # Number of dummy landmarks
|
|
# n = k - len(lmdks)
|
|
# combs = get_combs(reg, n)
|
|
# print_rslt(seq, combs, lmdks)
|
|
# exit()
|
|
|
|
|
|
if __name__ == '__main__':
|
|
try:
|
|
test()
|
|
except KeyboardInterrupt:
|
|
print('Interrupted by user.')
|
|
exit()
|