the-last-thing/code/lib/lmdk_sel.py

422 lines
11 KiB
Python

#!/usr/bin/env python3
import itertools
from lmdk_lib import *
import exp_mech
import numpy as np
import random
import time
'''
Print all the points.
Parameters:
seq - The point sequence.
combs - All the possible point combinations for a specified size.
lmdks - The landmarks.
Returns:
Nothing.
'''
def print_rslt(seq, combs, lmdks):
eval_sum = .0
for idx, c in enumerate(combs):
rslt = str(idx + 1) + ':\t'
for i in seq:
# Selected
if i in c:
rslt += '(' + str(i) + ')\t'
# Landmark
elif i in lmdks:
rslt += '*' + str(i) + '*\t'
# Not selected
else:
rslt += ' ' + str(i) + ' \t'
dists = get_rel_dists(seq, list(c), lmdks)
eval_orig = eval_seq(get_rel_dists(seq, [], lmdks))
eval_cur = eval_seq(dists)
eval_sum += eval_cur
# print(rslt, '\t', dists, '\t', sum(dists), '\t', eval_cur)
print(rslt, eval_cur)
eval_avg = eval_sum/len(combs)
print('Average STD (difference with original): %.4f (%.2f%%)' %(eval_avg, 100*(eval_avg - eval_orig)/eval_orig))
'''
Print the difference with the original.
Parameters:
seq - The point sequence.
combs - All the possible point combinations for a specified size.
lmdks - The landmarks.
Returns:
The difference with the original.
'''
def print_diff(seq, combs, lmdks):
eval_sum = .0
for c in combs:
dists = get_rel_dists(seq, list(c), lmdks)
eval_orig = eval_seq(get_rel_dists(seq, [], lmdks))
eval_cur = eval_seq(dists)
eval_sum += eval_cur
eval_avg = eval_sum/len(combs)
diff = 100*(eval_avg - eval_orig)/eval_orig
print('Average STD (difference with original): %.4f (%.2f%%)' %(eval_avg, diff))
return diff
'''
Finds the optimal set of regular points.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
Returns:
The optimal option.
Requirements:
n = Regular points
r = The size of a combination
Time - O(C(n, r) + 2^C(n, r))
Space - O(r*C(n, r))
'''
def get_opts_optim(seq, lmdks):
# Evaluate the original
eval_orig = eval_seq(get_rel_dists(seq, [], lmdks))
# Get all possible options
opts = get_opts(seq, lmdks)
# Evaluate options
# Track the minimum (best) evaluation
diff_min = float('inf')
# Track the optimal sequence (the one with the best evaluation)
optim = []
for opt in opts:
eval_sum = 0
for o in opt:
eval_sum += eval_seq(get_rel_dists(seq, o, lmdks))
# Compare with current optimal
diff_cur = abs(eval_sum/len(opt) - eval_orig)
if diff_cur < diff_min:
diff_min = diff_cur
optim = list(opt)
return optim
'''
Finds a set of regular points from top (less) to bottom (many)
(seems to perform better than the bottom-to-top approach).
Parameters:
seq - The point sequence.
lmdks - The landmarks.
Returns:
The resulting set of options.
Requirements:
For all possible sets of regular points n such that n = len(seq) - len(lmdks).
The result is a set of options for every possible value of n and for all possible combinations for each n.
Time - O(n^2)
Space - O(n^2)
'''
def get_opts_from_top(seq, lmdks):
# Evaluate the original
eval_orig = eval_seq(get_rel_dists(seq, [], lmdks))
opts = []
lmdks_cur = np.array(lmdks)
while not np.array_equal(lmdks_cur, seq):
# Find the combinations for one more point
reg = get_reg(seq, lmdks_cur)
# Track the minimum (best) evaluation
diff_min = float('inf')
point = ()
for r in reg:
# Evaluate current
eval_cur = eval_seq(get_rel_dists(seq, r, lmdks_cur))
# Compare evaluations
if abs(eval_cur - eval_orig) <= diff_min:
diff_min = abs(eval_cur - eval_orig)
point = r
# Save new point to landmarks
lmdks_cur = np.append(lmdks_cur, point)
lmdks_cur.sort()
# Add new option
opts.append(np.setdiff1d(lmdks_cur, lmdks))
return opts
def get_opts_from_top_h(seq, lmdks):
# Create histogram
hist, h = get_hist(seq, lmdks)
# Keep track of points
hist_cur = np.copy(hist)
# The options to be returned
hist_opts = []
# Keep adding points until the maximum is reached
while np.sum(hist_cur) < max(seq):
# Track the minimum (best) evaluation
diff_min = float('inf')
# The candidate option
hist_cand = np.copy(hist_cur)
# Check every possibility
for i, h_i in enumerate(hist_cur):
# Can we add one more point?
if h_i + 1 <= h:
hist_tmp = np.copy(hist_cur)
hist_tmp[i] += 1
# Find difference from original
diff_cur = get_norm(hist, hist_tmp)
# Remember if it is the best that you've seen
if diff_cur < diff_min:
diff_min = diff_cur
hist_cand = np.copy(hist_tmp)
# Update current histogram
hist_cur = np.copy(hist_cand)
# Add current best to options
hist_opts.append(hist_cand)
# Return options
return hist_opts
def get_non_opts_from_top(seq, lmdks):
# Evaluate the original
eval_orig = eval_seq(get_rel_dists(seq, [], lmdks))
non_opts = []
lmdks_cur = np.array(lmdks)
while not np.array_equal(lmdks_cur, seq):
# Find the combinations for one more point
reg = get_reg(seq, lmdks_cur)
# Track the maximum (worst) evaluation
diff_max = .0
point = ()
for r in reg:
# Evaluate current
eval_cur = eval_seq(get_rel_dists(seq, r, lmdks_cur))
# Compare evaluations
if abs(eval_cur - eval_orig) >= diff_max:
diff_max = abs(eval_cur - eval_orig)
point = r
# Save new point to landmarks
lmdks_cur = np.append(lmdks_cur, point)
lmdks_cur.sort()
# Add new option
non_opts.append(np.setdiff1d(lmdks_cur, lmdks))
return non_opts
'''
Finds a set of regular points from bottom (many) to top (less)
(seems to perform worse than the top-to-bottom approach).
Parameters:
seq - The point sequence.
lmdks - The landmarks.
Returns:
The resulting set of options.
Requirements:
For all possible sets of regular points n such that n = len(seq) - len(lmdks).
The result is a set of options for every possible value of n and for all possible combinations for each n.
Time - O(n^2)
Space - O(n^2)
'''
def get_opts_from_bottom(seq, lmdks):
# Evaluate the original
eval_orig = eval_seq(get_rel_dists(seq, [], lmdks))
# Start with all regular points as landmarks
lmdks_cur = np.array(get_reg(seq, lmdks))
opts = [lmdks_cur]
while lmdks_cur.size != 1:
# Track the minimum (best) evaluation
diff_min = float('inf')
point = ()
for lmdk in lmdks_cur:
# Evaluate current by removing one point
eval_cur = eval_seq(get_rel_dists(seq, [], np.setdiff1d(lmdks_cur, lmdk)))
# Compare evaluations
if abs(eval_cur - eval_orig) <= diff_min:
diff_min = abs(eval_cur - eval_orig)
point = lmdk
# Remove point from landmarks
lmdks_cur = np.setdiff1d(lmdks_cur, point)
# Add new option
opts.append(np.setdiff1d(lmdks_cur, lmdks))
return opts
def get_non_opts_from_bottom(seq, lmdks):
# Evaluate the original
eval_orig = eval_seq(get_rel_dists(seq, [], lmdks))
# Start with all regular points as landmarks
lmdks_cur = np.array(get_reg(seq, lmdks))
non_opts = [lmdks_cur]
while lmdks_cur.size != 1:
# Track the maximum (worst) evaluation
diff_max = .0
point = ()
for lmdk in lmdks_cur:
# Evaluate current by removing one point
eval_cur = eval_seq(get_rel_dists(seq, [], np.setdiff1d(lmdks_cur, lmdk)))
# Compare evaluations
if abs(eval_cur - eval_orig) >= diff_max:
diff_max = abs(eval_cur - eval_orig)
point = lmdk
# Remove point from landmarks
lmdks_cur = np.setdiff1d(lmdks_cur, point)
# Add new option
non_opts.append(np.setdiff1d(lmdks_cur, lmdks))
return non_opts
def find_lmdks(seq, lmdks, epsilon):
'''
Add dummy landmarks to original landmarks.
Parameters:
seq - All of the data points.
lmdks - The original landmarks.
epsilon - The available privacy budget.
Returns:
lmdks_new - The new landmarks.
The remaining privacy budget.
'''
# The new landmarks
lmdks_new = lmdks
# The privacy budget to use
eps_sel = 0
if len(lmdks) > 0 and len(seq) != len(lmdks):
# Get landmarks timestamps in sequence
lmdks_seq = find_lmdks_seq(seq, lmdks)
# Turn landmarks to histogram
hist, h = get_hist(get_seq(1, len(seq)), lmdks_seq)
# Find all possible options
opts = get_opts_from_top_h(get_seq(1, len(seq)), lmdks_seq)
# Landmarks selection budget
eps_sel = epsilon/(len(lmdks_seq) + 1)
# Get private landmarks timestamps
lmdks_seq, _ = exp_mech.exponential(hist, opts, exp_mech.score, 1.0, eps_sel)
# Get actual landmarks values
lmdks_new = seq[lmdks_seq]
return lmdks_new, epsilon - eps_sel
def test():
# Start and end points of the sequence
# # Nonrandom
# start = 1
# end = 10
# Random
start = 1
end = random.randint(start + 1, 100)
# Landmarks
# # Nonrandom
# lmdks = np.array([1, 3, 5, 8])
# Random
size = random.randint(start, end - 1)
lmdks = np.array(random.sample(range(start, end), size))
lmdks.sort()
# Print the parameters
print('Start : %d\n'
'End : %d\n'
'Size : %d\n'
'Landmarks: %s'
%(start, end, len(lmdks), str(lmdks)))
# Get the point sequence
seq = get_seq(start, end)
# Almost optimal solution
# print('\nOptimal...')
# t = time.time()
# opts_optim = get_opts_optim(seq, lmdks)
# print('Time:', time.time() - t, 'secs\n')
# print_rslt(seq, opts_optim, lmdks)
# Top to bottom approach
print('\nTop to bottom heuristic...')
t = time.time()
opts = get_opts_from_top(seq, lmdks)
print('Time:', time.time() - t, 'secs')
# print_rslt(seq, opts, lmdks)
diff_opt = print_diff(seq, opts, lmdks)
print('Non optimal version...')
non_opts = get_non_opts_from_top(seq, lmdks)
diff_non_opt = print_diff(seq, non_opts, lmdks)
print('Non optimal is %.2f%% different ([+]: worse | [-]: better).' %(100*(diff_non_opt - diff_opt)/diff_opt))
# Bottom to top approach
# Seems to perform worse
print('\nBottom to top heuristic...')
t = time.time()
opts = get_opts_from_bottom(seq, lmdks)
print('Time:', time.time() - t, 'secs')
# print_rslt(seq, opts, lmdks)
diff_opt = print_diff(seq, opts, lmdks)
print('Non optimal version...')
non_opts = get_non_opts_from_bottom(seq, lmdks)
diff_non_opt = print_diff(seq, non_opts, lmdks)
print('Non optimal is %.2f%% different ([+]: worse | [-]: better).' %(100*(diff_non_opt - diff_opt)/diff_opt))
# # Debugging
# # Number of desired actual and dummy landmarks
# k = len(lmdks) + 5
# # Number of dummy landmarks
# n = k - len(lmdks)
# combs = get_combs(reg, n)
# print_rslt(seq, combs, lmdks)
# exit()
if __name__ == '__main__':
try:
test()
except KeyboardInterrupt:
print('Interrupted by user.')
exit()