#!/usr/bin/env python3 import itertools from lmdk_lib import * import exp_mech import numpy as np import random import scipy.stats as stats import time ''' Print all the points. Parameters: seq - The point sequence. combs - All the possible point combinations for a specified size. lmdks - The landmarks. Returns: Nothing. ''' def print_rslt(seq, combs, lmdks): eval_sum = .0 for idx, c in enumerate(combs): rslt = str(idx + 1) + ':\t' for i in seq: # Selected if i in c: rslt += '(' + str(i) + ')\t' # Landmark elif i in lmdks: rslt += '*' + str(i) + '*\t' # Not selected else: rslt += ' ' + str(i) + ' \t' dists = get_rel_dists(seq, list(c), lmdks) eval_orig = eval_seq(get_rel_dists(seq, [], lmdks)) eval_cur = eval_seq(dists) eval_sum += eval_cur # print(rslt, '\t', dists, '\t', sum(dists), '\t', eval_cur) print(rslt, eval_cur) eval_avg = eval_sum/len(combs) print('Average STD (difference with original): %.4f (%.2f%%)' %(eval_avg, 100*(eval_avg - eval_orig)/eval_orig)) ''' Print the difference with the original. Parameters: seq - The point sequence. combs - All the possible point combinations for a specified size. lmdks - The landmarks. Returns: The difference with the original. ''' def print_diff(seq, combs, lmdks): eval_sum = .0 for c in combs: dists = get_rel_dists(seq, list(c), lmdks) eval_orig = eval_seq(get_rel_dists(seq, [], lmdks)) eval_cur = eval_seq(dists) eval_sum += eval_cur eval_avg = eval_sum/len(combs) diff = 100*(eval_avg - eval_orig)/eval_orig print('Average STD (difference with original): %.4f (%.2f%%)' %(eval_avg, diff)) return diff ''' Finds the optimal set of regular points. Parameters: seq - The point sequence. lmdks - The landmarks. Returns: The optimal option. Requirements: n = Regular points r = The size of a combination Time - O(C(n, r) + 2^C(n, r)) Space - O(r*C(n, r)) ''' def get_opts_optim(seq, lmdks): # Evaluate the original eval_orig = eval_seq(get_rel_dists(seq, [], lmdks)) # Get all possible options opts = get_opts(seq, lmdks) # Evaluate options # Track the minimum (best) evaluation diff_min = float('inf') # Track the optimal sequence (the one with the best evaluation) optim = [] for opt in opts: eval_sum = 0 for o in opt: eval_sum += eval_seq(get_rel_dists(seq, o, lmdks)) # Compare with current optimal diff_cur = abs(eval_sum/len(opt) - eval_orig) if diff_cur < diff_min: diff_min = diff_cur optim = list(opt) return optim ''' Finds a set of regular points from top (less) to bottom (many) (seems to perform better than the bottom-to-top approach). Parameters: seq - The point sequence. lmdks - The landmarks. Returns: The resulting set of options. Requirements: For all possible sets of regular points n such that n = len(seq) - len(lmdks). The result is a set of options for every possible value of n and for all possible combinations for each n. Time - O(n^2) Space - O(n^2) ''' def get_opts_from_top(seq, lmdks): # Evaluate the original eval_orig = eval_seq(get_rel_dists(seq, [], lmdks)) opts = [] lmdks_cur = np.array(lmdks) while not np.array_equal(lmdks_cur, seq): # Find the combinations for one more point reg = get_reg(seq, lmdks_cur) # Track the minimum (best) evaluation diff_min = float('inf') point = () for r in reg: # Evaluate current eval_cur = eval_seq(get_rel_dists(seq, r, lmdks_cur)) # Compare evaluations if abs(eval_cur - eval_orig) <= diff_min: diff_min = abs(eval_cur - eval_orig) point = r # Save new point to landmarks lmdks_cur = np.append(lmdks_cur, point) lmdks_cur.sort() # Add new option opts.append(np.setdiff1d(lmdks_cur, lmdks)) return opts def get_opts_from_top_h(seq, lmdks): # Create histogram hist, h = get_hist(seq, lmdks) # Keep track of points hist_cur = np.copy(hist) # The options to be returned hist_opts = [] # Keep adding points until the maximum is reached while np.sum(hist_cur) < len(seq): # Track the minimum (best) evaluation diff_min = float('inf') # The candidate option hist_cand = np.copy(hist_cur) # Check every possibility for i, h_i in enumerate(hist_cur): # Can we add one more point? if h_i + 1 <= h: hist_tmp = np.copy(hist_cur) hist_tmp[i] += 1 # Find difference from original diff_cur = get_norm(hist, hist_tmp) # Euclidean # diff_cur = get_emd(hist, hist_tmp) # Wasserstein # Remember if it is the best that you've seen if diff_cur < diff_min: diff_min = diff_cur hist_cand = np.copy(hist_tmp) # Update current histogram hist_cur = np.copy(hist_cand) # Add current best to options hist_opts.append(hist_cand) # Return options return hist_opts def get_non_opts_from_top(seq, lmdks): # Evaluate the original eval_orig = eval_seq(get_rel_dists(seq, [], lmdks)) non_opts = [] lmdks_cur = np.array(lmdks) while not np.array_equal(lmdks_cur, seq): # Find the combinations for one more point reg = get_reg(seq, lmdks_cur) # Track the maximum (worst) evaluation diff_max = .0 point = () for r in reg: # Evaluate current eval_cur = eval_seq(get_rel_dists(seq, r, lmdks_cur)) # Compare evaluations if abs(eval_cur - eval_orig) >= diff_max: diff_max = abs(eval_cur - eval_orig) point = r # Save new point to landmarks lmdks_cur = np.append(lmdks_cur, point) lmdks_cur.sort() # Add new option non_opts.append(np.setdiff1d(lmdks_cur, lmdks)) return non_opts ''' Finds a set of regular points from bottom (many) to top (less) (seems to perform worse than the top-to-bottom approach). Parameters: seq - The point sequence. lmdks - The landmarks. Returns: The resulting set of options. Requirements: For all possible sets of regular points n such that n = len(seq) - len(lmdks). The result is a set of options for every possible value of n and for all possible combinations for each n. Time - O(n^2) Space - O(n^2) ''' def get_opts_from_bottom(seq, lmdks): # Evaluate the original eval_orig = eval_seq(get_rel_dists(seq, [], lmdks)) # Start with all regular points as landmarks lmdks_cur = np.array(get_reg(seq, lmdks)) opts = [lmdks_cur] while lmdks_cur.size != 1: # Track the minimum (best) evaluation diff_min = float('inf') point = () for lmdk in lmdks_cur: # Evaluate current by removing one point eval_cur = eval_seq(get_rel_dists(seq, [], np.setdiff1d(lmdks_cur, lmdk))) # Compare evaluations if abs(eval_cur - eval_orig) <= diff_min: diff_min = abs(eval_cur - eval_orig) point = lmdk # Remove point from landmarks lmdks_cur = np.setdiff1d(lmdks_cur, point) # Add new option opts.append(np.setdiff1d(lmdks_cur, lmdks)) return opts def get_opts_from_bottom_h(seq, lmdks): # Create histogram hist, h = get_hist(seq, lmdks) # Keep track of points hist_cur = np.array([h]*len(hist)) while np.sum(hist_cur) > len(seq): hist_cur[-1] -= 1 # The options to be returned hist_opts = [] # Keep removing points until the minimum is reached while np.sum(hist_cur) > np.sum(hist): # Track the minimum (best) evaluation diff_min = float('inf') # The candidate option hist_cand = np.copy(hist_cur) # Check every possibility for i, h_i in enumerate(hist_cur): # Can we remove one more point? if h_i - 1 >= hist[i]: hist_tmp = np.copy(hist_cur) hist_tmp[i] -= 1 # Find difference from original # diff_cur = get_norm(hist, hist_tmp) # Euclidean diff_cur = get_emd(hist, hist_tmp) # Wasserstein # Remember if it is the best that you've seen if diff_cur < diff_min: diff_min = diff_cur hist_cand = np.copy(hist_tmp) # Update current histogram hist_cur = np.copy(hist_cand) # Add current best to options hist_opts.append(hist_cand) # Return options return hist_opts def get_non_opts_from_bottom(seq, lmdks): # Evaluate the original eval_orig = eval_seq(get_rel_dists(seq, [], lmdks)) # Start with all regular points as landmarks lmdks_cur = np.array(get_reg(seq, lmdks)) non_opts = [lmdks_cur] while lmdks_cur.size != 1: # Track the maximum (worst) evaluation diff_max = .0 point = () for lmdk in lmdks_cur: # Evaluate current by removing one point eval_cur = eval_seq(get_rel_dists(seq, [], np.setdiff1d(lmdks_cur, lmdk))) # Compare evaluations if abs(eval_cur - eval_orig) >= diff_max: diff_max = abs(eval_cur - eval_orig) point = lmdk # Remove point from landmarks lmdks_cur = np.setdiff1d(lmdks_cur, point) # Add new option non_opts.append(np.setdiff1d(lmdks_cur, lmdks)) return non_opts def find_lmdks(seq, lmdks, epsilon): ''' Add dummy landmarks to original landmarks. Parameters: seq - All of the data points. lmdks - The original landmarks. epsilon - The available privacy budget. Returns: lmdks_new - The new landmarks. The remaining privacy budget. ''' # The new landmarks lmdks_new = lmdks # The privacy budget to use eps_sel = 0 if len(lmdks) > 0 and len(seq) != len(lmdks): # Get landmarks timestamps in sequence lmdks_seq = find_lmdks_seq(seq, lmdks) # Turn landmarks to histogram hist, h = get_hist(get_seq(1, len(seq)), lmdks_seq) # Find all possible options opts = get_opts_from_top_h(get_seq(1, len(seq)), lmdks_seq) # Landmarks selection budget eps_sel = epsilon/(len(lmdks_seq) + 1) # Get landmarks histogram with dummy landmarks hist_new, _ = exp_mech.exponential(hist, opts, exp_mech.score, 1.0, eps_sel) # Split sequence in parts of size h pt_idx = [] for idx in range(1, len(seq), h): pt_idx.append([idx, idx + h - 1]) pt_idx[-1][1] = len(seq) # Get new landmarks indexes lmdks_seq_new = np.array([], dtype=int) for i, pt in enumerate(pt_idx): # Already landmarks lmdks_seq_pt = lmdks_seq[(lmdks_seq >= pt[0]) & (lmdks_seq <= pt[1])] # Sample randomly from the rest of the sequence size = hist_new[i] - len(lmdks_seq_pt) rglr = np.setdiff1d(np.arange(pt[0], pt[1] + 1), lmdks_seq_pt) # Add already landmarks lmdks_seq_new = np.concatenate([lmdks_seq_new, lmdks_seq_pt]) # Add new landmarks if size > 0 and len(rglr) > size: lmdks_seq_new = np.concatenate([lmdks_seq_new, np.random.choice( rglr, size = size, replace = False ) ]) # Get actual landmarks values lmdks_new = seq[lmdks_seq_new - 1] return lmdks_new, epsilon - eps_sel def find_lmdks_eps(seq, lmdks, epsilon): ''' Add dummy landmarks to original landmarks. Parameters: seq - All of the data points. lmdks - The original landmarks. epsilon - The available privacy budget. Returns: lmdks_new - The new landmarks. ''' # The new landmarks lmdks_new = lmdks if len(seq) != len(lmdks): # Get landmarks timestamps in sequence lmdks_seq = find_lmdks_seq(seq, lmdks) # Turn landmarks to histogram hist, h = get_hist(get_seq(1, len(seq)), lmdks_seq) # Find all possible options opts = get_opts_from_top_h(get_seq(1, len(seq)), lmdks_seq) # Get landmarks histogram with dummy landmarks hist_new, _ = exp_mech.exponential(hist, opts, exp_mech.score, 1.0, epsilon) # Split sequence in parts of size h pt_idx = [] for idx in range(1, len(seq), h): pt_idx.append([idx, idx + h - 1]) pt_idx[-1][1] = len(seq) # Get new landmarks indexes lmdks_seq_new = np.array([], dtype=int) for i, pt in enumerate(pt_idx): # Already landmarks lmdks_seq_pt = lmdks_seq[(lmdks_seq >= pt[0]) & (lmdks_seq <= pt[1])] # Sample randomly from the rest of the sequence size = int(hist_new[i] - len(lmdks_seq_pt)) rglr = np.setdiff1d(np.arange(pt[0], pt[1] + 1), lmdks_seq_pt) # Add already landmarks lmdks_seq_new = np.concatenate([lmdks_seq_new, lmdks_seq_pt]) # Add new landmarks if size > 0 and len(rglr) > size: lmdks_seq_new = np.concatenate([lmdks_seq_new, np.random.choice( rglr, size = size, replace = False ) ]) # Get actual landmarks values lmdks_new = seq[lmdks_seq_new - 1] return lmdks_new def test(): # Start and end points of the sequence # # Nonrandom # start = 1 # end = 10 # Random start = 1 end = random.randint(start + 1, 100) # Landmarks # # Nonrandom # lmdks = np.array([1, 3, 5, 8]) # Random size = random.randint(start, end - 1) lmdks = np.array(random.sample(range(start, end), size)) lmdks.sort() # Print the parameters print('Start : %d\n' 'End : %d\n' 'Size : %d\n' 'Landmarks: %s' %(start, end, len(lmdks), str(lmdks))) # Get the point sequence seq = get_seq(start, end) # Almost optimal solution # print('\nOptimal...') # t = time.time() # opts_optim = get_opts_optim(seq, lmdks) # print('Time:', time.time() - t, 'secs\n') # print_rslt(seq, opts_optim, lmdks) # Top to bottom approach print('\nTop to bottom heuristic...') t = time.time() opts = get_opts_from_top(seq, lmdks) print('Time:', time.time() - t, 'secs') # print_rslt(seq, opts, lmdks) diff_opt = print_diff(seq, opts, lmdks) print('Non optimal version...') non_opts = get_non_opts_from_top(seq, lmdks) diff_non_opt = print_diff(seq, non_opts, lmdks) print('Non optimal is %.2f%% different ([+]: worse | [-]: better).' %(100*(diff_non_opt - diff_opt)/diff_opt)) # Bottom to top approach # Seems to perform worse print('\nBottom to top heuristic...') t = time.time() opts = get_opts_from_bottom(seq, lmdks) print('Time:', time.time() - t, 'secs') # print_rslt(seq, opts, lmdks) diff_opt = print_diff(seq, opts, lmdks) print('Non optimal version...') non_opts = get_non_opts_from_bottom(seq, lmdks) diff_non_opt = print_diff(seq, non_opts, lmdks) print('Non optimal is %.2f%% different ([+]: worse | [-]: better).' %(100*(diff_non_opt - diff_opt)/diff_opt)) # # Debugging # # Number of desired actual and dummy landmarks # k = len(lmdks) + 5 # # Number of dummy landmarks # n = k - len(lmdks) # combs = get_combs(reg, n) # print_rslt(seq, combs, lmdks) # exit() if __name__ == '__main__': try: test() except KeyboardInterrupt: print('Interrupted by user.') exit()