diff --git a/code/expt/expt_lmdk_sel.py b/code/expt/expt_lmdk_sel.py index 8fb7261..a273d2a 100644 --- a/code/expt/expt_lmdk_sel.py +++ b/code/expt/expt_lmdk_sel.py @@ -1,5 +1,7 @@ #!/usr/bin/env python3 +import sys +sys.path.insert(1, '../lib') import argparse import lmdk_lib import lmdk_sel @@ -36,7 +38,7 @@ def main(args): plt.xlim(x_i.min() - x_margin, x_i.max() + x_margin) # The y axis plt.ylabel('Mean absolute error') # Set y axis label. - plt.ylim(0, len(seq)/3) + # plt.ylim(0, len(seq)/3) # Bar offset x_offset = -(bar_width/2)*(len(epsilon) - 1) for e_i, e in enumerate(epsilon): @@ -45,9 +47,26 @@ def main(args): for r in range(args.reps): lmdks = lmdk_lib.get_lmdks(seq, n, d) hist, h = lmdk_lib.get_hist(seq, lmdks) - opts = lmdk_sel.get_opts_from_top_h(seq, lmdks) - delta = 1.0 - res, _ = exp_mech.exponential(hist, opts, exp_mech.score, delta, e) + res = np.zeros([len(hist)]) + # Split sequence in parts of size h + pt_idx = [] + for idx in range(h, len(seq), h): + pt_idx.append(idx) + seq_pt = np.split(seq, pt_idx) + for pt_i, pt in enumerate(seq_pt): + # Find this part's landmarks + lmdks_pt = np.intersect1d(pt, lmdks) + # Find possible options for this part + opts = lmdk_sel.get_opts_from_top_h(pt, lmdks_pt) + # Turn part to histogram + hist_pt, _ = lmdk_lib.get_hist(pt, lmdks_pt) + # Get an option for this part + res_pt = np.array([]) + if len(opts) > 0: + res_pt, _ = exp_mech.exponential(hist_pt, opts, exp_mech.score, 1.0, e) + # Merge options of all parts + res[pt_i] = np.sum(res_pt) + # Calculate MAE mae[n_i] += lmdk_lib.get_norm(hist, res)/args.reps # Plot bar for current epsilon plt.bar( @@ -59,7 +78,7 @@ def main(args): ) # Change offset for next bar x_offset += bar_width - path = str('/home/manos/Git/the-thing/code/expt_lmdk_sel/' + title) + path = str('../../rslt/lmdk_sel/' + title) # Plot legend lmdk_lib.plot_legend() # Show plot diff --git a/code/lib/exp_mech.py b/code/lib/exp_mech.py new file mode 100644 index 0000000..25c7ea5 --- /dev/null +++ b/code/lib/exp_mech.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 + +import numpy as np +import math +import heapq +import itertools +import random +import scipy.stats as stats +import lmdk_lib +from matplotlib import pyplot as plt +import time + + +''' + The scoring function. + + Parameters: + data - The data. + option - The option to evaluate. + Returns: + The score for the option. +''' +def score(data, option): + return (option.sum() - data.sum()) + + +''' + The exponential mechanism. + + Parameters: + x - The data. + R - The possible outputs. + u - The scoring function. + delta - The sensitivity of the scoring function. + epsilon - The privacy budget. + Returns: + res - A randomly sampled output. + pr - The PDF of all possible outputs. +''' +def exponential(x, R, u, delta, epsilon): + # Calculate the score for each element of R + scores = [u(x, r) for r in R] + # Normalize the scores between 0 and 1 + # (the higher, the better the utility) + scores = 1 - (scores - np.min(scores))/(np.max(scores) - np.min(scores)) + + # Calculate the probability for each element, based on its score + pr = [np.exp(epsilon*score/(2*delta)) for score in scores] + + # Normalize the probabilities so that they sum to 1 + pr = pr/np.linalg.norm(pr, ord = 1) + + # Debugging + # print(R[np.argmax(pr)], pr.max(), scores[np.argmax(pr)]) + # print(R[np.argmin(pr)], pr.min(), scores[np.argmin(pr)]) + # print(abs(pr.max() - pr.min()), abs(scores[np.argmax(pr)] - scores[np.argmin(pr)])) + + # Choose an element from R based on the probabilities + if len(pr) > 0: + return R[np.random.choice(range(len(R)), 1, p = pr)[0]], pr + else: + return np.array([]), pr + + +def main(): + start, end = 1.0, 10.0 + scale = 1.0 + locs = [2.0, 4.0, 8.0] + k = len(locs) + + # dists = [truncnorm(start, end, loc, scale) for loc in locs] + dists = [stats.laplace(loc, scale) for loc in locs] + + mix = lmdk_lib.MixtureModel(dists) + + delta = 1.0 + epsilon = 10.0 + + combos = list(itertools.combinations(range(int(start), int(end) + 1), k)) + + res, pr = exponential(mix, combos, score, delta, epsilon) + + plt.rc('font', family='serif') + plt.rc('font', size=10) + plt.rc('text', usetex=True) + + # Plot the options' probabilities. + # pr.sort() + # n = np.arange(1.0, len(pr) + 1.0, 1.0) + # plt.plot(n,\ + # pr,\ + # label=r'$\textrm{Pr}$',\ + # color='blue') + # # Configure the plot. + # plt.axis([1.0, len(pr), 0.0, max(pr)]) # Set plot box. + # plt.legend(loc='best', frameon=False) # Set plot legend. + # plt.grid(axis='y', alpha=1.0) # Add grid on y axis. + # plt.xlabel('Options') # Set x axis label. + # plt.ylabel('Likelihood') # Set y axis label. + # plt.show() # Show the plot in a new window. + + x = np.arange(start, end, 0.01) + plt.plot(x,\ + mix.pdf(x),\ + label=r'$\textrm{Mix}$',\ + color='red') + + # print(mix.sample(start, end, 10)) + + # Test MixtureModel's sample function + # t = 1000 + # c = np.array(int(end)*[0]) + # for i in range(t): + # c[mix.sample(start, end, 1)[0] - 1] += 1 + + # plt.plot(range(int(start), int(end) + 1),\ + # c/t,\ + # label=r'$\textrm{Test}$',\ + # color='blue') + + # Configure the plot. + plt.axis([start, end, 0.0, 1.0]) # Set plot box. + plt.legend(loc='best', frameon=False) # Set plot legend. + plt.grid(axis='y', alpha=1.0) # Add grid on y axis. + plt.xlabel('Timestamp') # Set x axis label. + plt.ylabel('Likelihood') # Set y axis label. + + plt.show() # Show the plot in a new window. + + +if __name__ == '__main__': + try: + start_time = time.time() + main() + end_time = time.time() + print('##############################') + print('Time : %.4fs' % (end_time - start_time)) + print('##############################') + except KeyboardInterrupt: + print('Interrupted by user.') + exit() diff --git a/code/lib/lmdk_sel.py b/code/lib/lmdk_sel.py new file mode 100644 index 0000000..2685844 --- /dev/null +++ b/code/lib/lmdk_sel.py @@ -0,0 +1,388 @@ +#!/usr/bin/env python3 + +import itertools +from lmdk_lib import * +import numpy as np +import random +import time + + +''' + Print all the points. + + Parameters: + seq - The point sequence. + combs - All the possible point combinations for a specified size. + lmdks - The landmarks. + Returns: + Nothing. +''' +def print_rslt(seq, combs, lmdks): + eval_sum = .0 + for idx, c in enumerate(combs): + rslt = str(idx + 1) + ':\t' + for i in seq: + # Selected + if i in c: + rslt += '(' + str(i) + ')\t' + # Landmark + elif i in lmdks: + rslt += '*' + str(i) + '*\t' + # Not selected + else: + rslt += ' ' + str(i) + ' \t' + dists = get_rel_dists(seq, list(c), lmdks) + eval_orig = eval_seq(get_rel_dists(seq, [], lmdks)) + eval_cur = eval_seq(dists) + eval_sum += eval_cur + # print(rslt, '\t', dists, '\t', sum(dists), '\t', eval_cur) + print(rslt, eval_cur) + eval_avg = eval_sum/len(combs) + print('Average STD (difference with original): %.4f (%.2f%%)' %(eval_avg, 100*(eval_avg - eval_orig)/eval_orig)) + + +''' + Print the difference with the original. + + Parameters: + seq - The point sequence. + combs - All the possible point combinations for a specified size. + lmdks - The landmarks. + Returns: + The difference with the original. +''' +def print_diff(seq, combs, lmdks): + eval_sum = .0 + for c in combs: + dists = get_rel_dists(seq, list(c), lmdks) + eval_orig = eval_seq(get_rel_dists(seq, [], lmdks)) + eval_cur = eval_seq(dists) + eval_sum += eval_cur + eval_avg = eval_sum/len(combs) + diff = 100*(eval_avg - eval_orig)/eval_orig + print('Average STD (difference with original): %.4f (%.2f%%)' %(eval_avg, diff)) + return diff + + +''' + Finds the optimal set of regular points. + + Parameters: + seq - The point sequence. + lmdks - The landmarks. + Returns: + The optimal option. + + Requirements: + n = Regular points + r = The size of a combination + Time - O(C(n, r) + 2^C(n, r)) + Space - O(r*C(n, r)) +''' +def get_opts_optim(seq, lmdks): + # Evaluate the original + eval_orig = eval_seq(get_rel_dists(seq, [], lmdks)) + + # Get all possible options + opts = get_opts(seq, lmdks) + + # Evaluate options + # Track the minimum (best) evaluation + diff_min = float('inf') + + # Track the optimal sequence (the one with the best evaluation) + optim = [] + for opt in opts: + eval_sum = 0 + for o in opt: + eval_sum += eval_seq(get_rel_dists(seq, o, lmdks)) + # Compare with current optimal + diff_cur = abs(eval_sum/len(opt) - eval_orig) + if diff_cur < diff_min: + diff_min = diff_cur + optim = list(opt) + + return optim + + +''' + Finds a set of regular points from top (less) to bottom (many) + (seems to perform better than the bottom-to-top approach). + + Parameters: + seq - The point sequence. + lmdks - The landmarks. + Returns: + The resulting set of options. + + Requirements: + For all possible sets of regular points n such that n = len(seq) - len(lmdks). + The result is a set of options for every possible value of n and for all possible combinations for each n. + Time - O(n^2) + Space - O(n^2) +''' +def get_opts_from_top(seq, lmdks): + # Evaluate the original + eval_orig = eval_seq(get_rel_dists(seq, [], lmdks)) + + opts = [] + lmdks_cur = np.array(lmdks) + while not np.array_equal(lmdks_cur, seq): + # Find the combinations for one more point + reg = get_reg(seq, lmdks_cur) + + # Track the minimum (best) evaluation + diff_min = float('inf') + + point = () + for r in reg: + # Evaluate current + eval_cur = eval_seq(get_rel_dists(seq, r, lmdks_cur)) + + # Compare evaluations + if abs(eval_cur - eval_orig) <= diff_min: + diff_min = abs(eval_cur - eval_orig) + point = r + + # Save new point to landmarks + lmdks_cur = np.append(lmdks_cur, point) + lmdks_cur.sort() + + # Add new option + opts.append(np.setdiff1d(lmdks_cur, lmdks)) + + return opts + +def get_opts_from_top_h(seq, lmdks): + # Create histogram + hist, h = get_hist(seq, lmdks) + # Keep track of points + hist_cur = np.copy(hist) + # The options to be returned + hist_opts = [] + # Keep adding points until the maximum is reached + while np.sum(hist_cur) < max(seq): + # Track the minimum (best) evaluation + diff_min = float('inf') + # The candidate option + hist_cand = np.copy(hist_cur) + # Check every possibility + for i, h_i in enumerate(hist_cur): + # Can we add one more point? + if h_i + 1 <= h: + hist_tmp = np.copy(hist_cur) + hist_tmp[i] += 1 + # Find difference from original + diff_cur = get_norm(hist, hist_tmp) + # Remember if it is the best that you've seen + if diff_cur < diff_min: + diff_min = diff_cur + hist_cand = np.copy(hist_tmp) + # Update current histogram + hist_cur = np.copy(hist_cand) + # Add current best to options + hist_opts.append(hist_cand) + # Return options + return hist_opts + + +def get_non_opts_from_top(seq, lmdks): + # Evaluate the original + eval_orig = eval_seq(get_rel_dists(seq, [], lmdks)) + + non_opts = [] + lmdks_cur = np.array(lmdks) + while not np.array_equal(lmdks_cur, seq): + # Find the combinations for one more point + reg = get_reg(seq, lmdks_cur) + + # Track the maximum (worst) evaluation + diff_max = .0 + + point = () + for r in reg: + # Evaluate current + eval_cur = eval_seq(get_rel_dists(seq, r, lmdks_cur)) + + # Compare evaluations + if abs(eval_cur - eval_orig) >= diff_max: + diff_max = abs(eval_cur - eval_orig) + point = r + + # Save new point to landmarks + lmdks_cur = np.append(lmdks_cur, point) + lmdks_cur.sort() + + # Add new option + non_opts.append(np.setdiff1d(lmdks_cur, lmdks)) + + return non_opts + + +''' + Finds a set of regular points from bottom (many) to top (less) + (seems to perform worse than the top-to-bottom approach). + + Parameters: + seq - The point sequence. + lmdks - The landmarks. + Returns: + The resulting set of options. + + Requirements: + For all possible sets of regular points n such that n = len(seq) - len(lmdks). + The result is a set of options for every possible value of n and for all possible combinations for each n. + Time - O(n^2) + Space - O(n^2) +''' +def get_opts_from_bottom(seq, lmdks): + # Evaluate the original + eval_orig = eval_seq(get_rel_dists(seq, [], lmdks)) + + # Start with all regular points as landmarks + lmdks_cur = np.array(get_reg(seq, lmdks)) + + opts = [lmdks_cur] + while lmdks_cur.size != 1: + # Track the minimum (best) evaluation + diff_min = float('inf') + + point = () + for lmdk in lmdks_cur: + # Evaluate current by removing one point + eval_cur = eval_seq(get_rel_dists(seq, [], np.setdiff1d(lmdks_cur, lmdk))) + + # Compare evaluations + if abs(eval_cur - eval_orig) <= diff_min: + diff_min = abs(eval_cur - eval_orig) + point = lmdk + + # Remove point from landmarks + lmdks_cur = np.setdiff1d(lmdks_cur, point) + + # Add new option + opts.append(np.setdiff1d(lmdks_cur, lmdks)) + + return opts + + +def get_non_opts_from_bottom(seq, lmdks): + # Evaluate the original + eval_orig = eval_seq(get_rel_dists(seq, [], lmdks)) + + # Start with all regular points as landmarks + lmdks_cur = np.array(get_reg(seq, lmdks)) + + non_opts = [lmdks_cur] + while lmdks_cur.size != 1: + # Track the maximum (worst) evaluation + diff_max = .0 + + point = () + for lmdk in lmdks_cur: + # Evaluate current by removing one point + eval_cur = eval_seq(get_rel_dists(seq, [], np.setdiff1d(lmdks_cur, lmdk))) + + # Compare evaluations + if abs(eval_cur - eval_orig) >= diff_max: + diff_max = abs(eval_cur - eval_orig) + point = lmdk + + # Remove point from landmarks + lmdks_cur = np.setdiff1d(lmdks_cur, point) + + # Add new option + non_opts.append(np.setdiff1d(lmdks_cur, lmdks)) + + return non_opts + + +def test(): + # Start and end points of the sequence + # # Nonrandom + # start = 1 + # end = 10 + # Random + start = 1 + end = random.randint(start + 1, 100) + + # Landmarks + # # Nonrandom + # lmdks = np.array([1, 3, 5, 8]) + # Random + size = random.randint(start, end - 1) + lmdks = np.array(random.sample(range(start, end), size)) + lmdks.sort() + + # Print the parameters + print('Start : %d\n' + 'End : %d\n' + 'Size : %d\n' + 'Landmarks: %s' + %(start, end, len(lmdks), str(lmdks))) + + # Get the point sequence + seq = get_seq(start, end) + + # Almost optimal solution + # print('\nOptimal...') + # t = time.time() + + # opts_optim = get_opts_optim(seq, lmdks) + + # print('Time:', time.time() - t, 'secs\n') + + # print_rslt(seq, opts_optim, lmdks) + + + # Top to bottom approach + print('\nTop to bottom heuristic...') + t = time.time() + + opts = get_opts_from_top(seq, lmdks) + + print('Time:', time.time() - t, 'secs') + + # print_rslt(seq, opts, lmdks) + diff_opt = print_diff(seq, opts, lmdks) + + print('Non optimal version...') + non_opts = get_non_opts_from_top(seq, lmdks) + diff_non_opt = print_diff(seq, non_opts, lmdks) + print('Non optimal is %.2f%% different ([+]: worse | [-]: better).' %(100*(diff_non_opt - diff_opt)/diff_opt)) + + + # Bottom to top approach + # Seems to perform worse + print('\nBottom to top heuristic...') + t = time.time() + + opts = get_opts_from_bottom(seq, lmdks) + + print('Time:', time.time() - t, 'secs') + + # print_rslt(seq, opts, lmdks) + diff_opt = print_diff(seq, opts, lmdks) + + print('Non optimal version...') + non_opts = get_non_opts_from_bottom(seq, lmdks) + diff_non_opt = print_diff(seq, non_opts, lmdks) + print('Non optimal is %.2f%% different ([+]: worse | [-]: better).' %(100*(diff_non_opt - diff_opt)/diff_opt)) + + + # # Debugging + # # Number of desired actual and dummy landmarks + # k = len(lmdks) + 5 + # # Number of dummy landmarks + # n = k - len(lmdks) + # combs = get_combs(reg, n) + # print_rslt(seq, combs, lmdks) + # exit() + + +if __name__ == '__main__': + try: + test() + except KeyboardInterrupt: + print('Interrupted by user.') + exit()