diff --git a/code/expt/avg_dist.py b/code/expt/avg_dist.py
new file mode 100644
index 0000000..941a8de
--- /dev/null
+++ b/code/expt/avg_dist.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+
+import sys
+sys.path.insert(1, '../lib')
+import argparse
+import gdp
+import lmdk_lib
+import math
+from matplotlib import pyplot as plt
+import numpy as np
+import os
+import time
+
+
+def main(args):
+    # Number of timestamps
+    seq = lmdk_lib.get_seq(1, args.time)
+    # Distribution type
+    dist_type = np.array(range(0, 4))
+    # Number of landmarks
+    lmdk_n = np.array(range(0, args.time + 1, int(args.time/5)))
+
+    markers = [
+        '^',  # Symmetric
+        'v',  # Skewed
+        'D',  # Bimodal
+        's'   # Uniform
+    ]
+
+    # Initialize plot
+    lmdk_lib.plot_init()
+    # The x axis
+    x_i = np.arange(len(lmdk_n))
+    plt.xticks(x_i, ((lmdk_n/len(seq))*100).astype(int))
+    plt.xlabel('Landmarks (%)')  # Set x axis label.
+    plt.xlim(x_i.min(), x_i.max())
+    # The y axis
+    plt.ylabel('Normalized average distance')  # Set y axis label.
+    plt.yscale('log')
+    plt.ylim(.001, 1)
+    # Logging
+    print('Average distance', end='', flush=True)
+    for d_i, d in enumerate(dist_type):
+        avg_dist = np.zeros(len(lmdk_n))
+        # Logging
+        print('.', end='', flush=True)
+        for i, n in enumerate(lmdk_n):
+            for r in range(args.reps):
+                # Generate landmarks
+                lmdks = lmdk_lib.get_lmdks(seq, n, d)
+                # Calculate average distance
+                avg_cur = 0
+                for t in seq:
+                    t_prv, t_nxt = gdp.get_limits(t, seq, lmdks)
+                    avg_cur += (abs(t - t_prv) - 1 + abs(t - t_nxt) - 1)/len(seq)
+                # Average over the repetitions
+                avg_dist[i] += avg_cur/args.reps
+        # Rescaling (min-max normalization)
+        # https://en.wikipedia.org/wiki/Feature_scaling#Rescaling_(min-max_normalization)
+        avg_dist = (avg_dist - avg_dist.min())/(avg_dist.max() - avg_dist.min())
+        # The last value is 0 after rescaling; clip it so that it is visible on the log scale.
+        if avg_dist[-1] == 0:
+            avg_dist[-1] = .001
+        # Set label
+        label = lmdk_lib.dist_type_to_str(d_i)
+        if d_i == 1:
+            label = 'Skewed'
+        # Plot line
+        plt.plot(
+            x_i,
+            avg_dist,
+            label=label,
+            marker=markers[d_i],
+            markersize=lmdk_lib.marker_size,
+            markeredgewidth=0,
+            linewidth=lmdk_lib.line_width
+        )
+    # Plot legend
+    lmdk_lib.plot_legend()
+    # Show plot
+    # plt.show()
+    # Save plot
+    lmdk_lib.save_plot(str('../../rslt/avg_dist/' + 'avg-dist' + '.pdf'))
+    print(' [OK]', flush=True)
+
+
+'''
+    Parse arguments.
+
+    Optional:
+        reps - The number of repetitions.
+        time - The time limit of the sequence.
+'''
+def parse_args():
+    # Create argument parser.
+    parser = argparse.ArgumentParser()
+
+    # Mandatory arguments.
+
+    # Optional arguments.
+    parser.add_argument('-r', '--reps', help='The number of repetitions.', type=int, default=1)
+    parser.add_argument('-t', '--time', help='The time limit of the sequence.', type=int, default=100)
+
+    # Parse arguments.
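+    # Hypothetical invocation (illustrative values only):
+    #   python3 avg_dist.py --reps 10 --time 100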
+    args = parser.parse_args()
+
+    return args
+
+
+if __name__ == '__main__':
+    try:
+        args = parse_args()
+        start_time = time.time()
+        main(args)
+        end_time = time.time()
+        print('##############################')
+        print('Time elapsed: %s' % (time.strftime('%H:%M:%S', time.gmtime(end_time - start_time))))
+        print('##############################')
+    except KeyboardInterrupt:
+        print('Interrupted by user.')
+        exit()
diff --git a/code/expt/dist_cor.py b/code/expt/dist_cor.py
new file mode 100644
index 0000000..039c5db
--- /dev/null
+++ b/code/expt/dist_cor.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+
+import sys
+sys.path.insert(1, '../lib')
+import argparse
+import gdp
+import itertools
+import lmdk_bgt
+import lmdk_lib
+import numpy as np
+import os
+from matplotlib import pyplot as plt
+import time
+
+
+def main(args):
+    # Privacy goal
+    epsilon = 1.0
+    # Number of timestamps
+    seq = lmdk_lib.get_seq(1, args.time)
+    # Correlation degree (higher values mean weaker correlations)
+    cor_deg = np.array([.01, .1, 1.0])
+    cor_lbl = ['Strong correlation', 'Moderate correlation', 'Weak correlation']
+    # Distribution type
+    dist_type = np.array(range(0, 4))
+    # Number of landmarks
+    lmdk_n = np.array(range(0, args.time + 1, int(args.time/5)))
+    # Width of bars
+    bar_width = 1/(len(dist_type) + 1)
+    # For each correlation degree
+    for c_i, c in enumerate(cor_deg):
+        # Logging
+        title = cor_lbl[c_i]
+        print('(%d/%d) %s' %(c_i + 1, len(cor_deg), title), end='', flush=True)
+        # The transition matrix
+        p = gdp.gen_trans_mt(2, c)
+        # Bar offset
+        x_offset = -(bar_width/2)*(len(dist_type) - 1)
+        # Initialize plot
+        lmdk_lib.plot_init()
+        # The x axis
+        x_i = np.arange(len(lmdk_n))
+        plt.xticks(x_i, ((lmdk_n/len(seq))*100).astype(int))
+        plt.xlabel('Landmarks (%)')  # Set x axis label.
+        x_margin = bar_width*(len(dist_type)/2 + 1)
+        plt.xlim(x_i.min() - x_margin, x_i.max() + x_margin)
+        # The y axis
+        plt.ylabel('Privacy loss')  # Set y axis label.
+        plt.yscale('log')
+        plt.ylim(epsilon/10, 100*len(seq))
+        # plt.ylim(0, 10000)
+        for d_i, d in enumerate(dist_type):
+            print('.', end='', flush=True)
+            # Initialization
+            e = np.zeros(len(lmdk_n))
+            a = np.zeros(len(lmdk_n))
+            for i, n in enumerate(lmdk_n):
+                for r in range(args.reps):
+                    # Generate landmarks
+                    lmdks = lmdk_lib.get_lmdks(seq, n, d)
+                    # Uniform budget allocation
+                    e_cur = lmdk_bgt.uniform(seq, lmdks, epsilon)
+                    _, _, a_cur = gdp.tpl_lmdk_mem(e_cur, p, p, seq, lmdks)
+                    # Save privacy loss
+                    e[i] += np.sum(e_cur)/args.reps
+                    a[i] += np.sum(a_cur)/args.reps
+            # Set label
+            label = lmdk_lib.dist_type_to_str(d_i)
+            if d_i == 1:
+                label = 'Skewed'
+            # Plot bar for current distribution
+            plt.bar(
+                x_i + x_offset,
+                a,
+                bar_width,
+                label=label,
+                linewidth=lmdk_lib.line_width
+            )
+            # Change offset for next bar
+            x_offset += bar_width
+        # Plot line for no correlation
+        plt.plot(
+            x_i,
+            e,
+            linewidth=lmdk_lib.line_width,
+            color='#e0e0e0',
+        )
+        # Plot legend
+        lmdk_lib.plot_legend()
+        # Show plot
+        # plt.show()
+        # Save plot
+        lmdk_lib.save_plot(str('../../rslt/dist_cor/' + title + '.pdf'))
+        print(' [OK]', flush=True)
+
+
+'''
+    Parse arguments.
+
+    Optional:
+        reps - The number of repetitions.
+        time - The time limit of the sequence.
+'''
+def parse_args():
+    # Create argument parser.
+    parser = argparse.ArgumentParser()
+
+    # Mandatory arguments.
+
+    # Optional arguments.
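+    # Hypothetical invocation using these options (illustrative values only):
+    #   python3 dist_cor.py --reps 10 --time 100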
+    parser.add_argument('-r', '--reps', help='The number of repetitions.', type=int, default=1)
+    parser.add_argument('-t', '--time', help='The time limit of the sequence.', type=int, default=100)
+
+    # Parse arguments.
+    args = parser.parse_args()
+
+    return args
+
+
+if __name__ == '__main__':
+    try:
+        args = parse_args()
+        start_time = time.time()
+        main(args)
+        end_time = time.time()
+        print('##############################')
+        print('Time elapsed: %s' % (time.strftime('%H:%M:%S', time.gmtime(end_time - start_time))))
+        print('##############################')
+    except KeyboardInterrupt:
+        print('Interrupted by user.')
+        exit()
diff --git a/code/lib/gdp.py b/code/lib/gdp.py
new file mode 100644
index 0000000..8467e28
--- /dev/null
+++ b/code/lib/gdp.py
@@ -0,0 +1,1576 @@
+#!/usr/bin/env python3
+
+import argparse
+import collections
+import csv
+import datetime
+import math
+from matplotlib import pyplot as plt
+import numpy as np
+import os
+import random
+from scipy.sparse.linalg import eigs
+from scipy.stats import laplace
+from scipy.stats import norm
+import sympy as sp
+import time
+import xxhash
+
+
+# Debugging
+DEBUG = False   # When enabled, print debugging information.
+
+
+# Memoization
+MEM = {}        # The cache.
+MISS = 0        # Number of additions to the cache.
+TOTAL = 0       # Number of cache accesses.
+
+
+'''
+    Read data from a file.
+
+    Parameters:
+        path - The relative path to the data file.
+    Returns:
+        data - A list of tuples [uid, timestamp, lng, lat, loc].
+'''
+def load_data(path):
+    print('Loading data from', os.path.abspath(path), '... ', end='')
+    data = []
+    try:
+        with open(path, 'r') as f:
+            reader = csv.reader(f, delimiter=',')
+            data = list(reader)
+        print('OK.')
+        return data
+    except IOError:
+        print('Oops!')
+        exit()
+
+
+'''
+    Save output to a file.
+
+    Parameters:
+        path - The relative path to the output file.
+        t    - The number of timestamps.
+        e    - The privacy budget at each timestamp.
+        a_b  - The backward privacy loss at each timestamp.
+        a_f  - The forward privacy loss at each timestamp.
+        a    - The temporal privacy loss at each timestamp.
+    Returns:
+        Nothing.
+'''
+def save_output(path, t, e, a_b, a_f, a):
+    # timestamp = time.strftime('%Y%m%d%H%M%S')
+    print('Saving output to %s... ' %(path), end='', flush=True)
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    with open(path, 'w') as f:
+        w = csv.writer(f, delimiter=';')
+        w.writerow(['t', 'e', 'bpl', 'fpl', 'tpl'])
+        w.writerows(zip(list(range(1, t + 1)), e, a_b, a_f, a))
+    print('OK.', flush=True)
+
+
+'''
+    Get all the timestamps from the input data.
+
+    Parameters:
+        data       - The input data set.
+    Returns:
+        timestamps - An ndarray of all of the timestamps from the input data.
+'''
+def get_timestamps(data):
+    print('Getting a list of all timestamps... ', end='', flush=True)
+    timestamps = np.sort(np.unique(np.array(data)[:, 1]))
+    if not len(timestamps):
+        print('No timestamps found.', flush=True)
+        exit()
+    # Check timestamps' order.
+    prev = None
+    for t in timestamps:
+        curr = time.mktime(datetime.datetime.strptime(t, '%Y-%m-%d %H:%M').timetuple())
+        if prev:
+            if curr < prev:
+                print('Wrong order.', flush=True)
+                exit()
+        prev = curr
+    print('OK.', flush=True)
+    if DEBUG:
+        print(timestamps, len(timestamps))
+    return timestamps
+
+
+'''
+    Get all the unique locations from the input data.
+
+    Parameters:
+        data - The input data set.
+    Returns:
+        locs - A sorted list (as strings) of all the unique locations in the input data.
+'''
+def get_locs(data):
+    print('Getting a list of all locations... ', end='', flush=True)
+    locs = np.sort(np.unique(np.array(data)[:, 4].astype(int)))
+    if not len(locs):
+        print('No locations found.', flush=True)
+        exit()
+    print('OK.', flush=True)
+    if DEBUG:
+        print(list(map(str, locs)), len(locs))
+    return list(map(str, locs))
+
+
+'''
+    Get the counts at every location for a specific timestamp.
+
+    Parameters:
+        data - The input data set.
+        t    - The timestamp of interest.
+    Returns:
+        cnts - A dict {loc:cnt} with the counts at every location for a specific timestamp.
+'''
+def get_cnts(data, t):
+    print('Getting all counts at %s... ' %(t), end='', flush=True)
+    locs = get_locs(data)
+    cnts = dict.fromkeys(locs, 0)
+    for d in data:
+        if d[1] == t:
+            # The keys of cnts are strings (see get_locs).
+            cnts[d[4]] += 1
+    print('OK.', flush=True)
+    if DEBUG:
+        print(cnts)
+    return cnts
+
+
+'''
+    Get the counts at every location for every timestamp.
+
+    Parameters:
+        data - The input data set.
+    Returns:
+        cnts - A dict {timestamp@loc:cnt} with all the counts at every location for every timestamp.
+'''
+def get_all_cnts(data):
+    cnts = {}
+    for d in data:
+        key = d[1] + '@' + d[4]
+        cnts[key] = cnts.get(key, 0) + 1
+    if DEBUG:
+        print(cnts)
+    return cnts
+
+
+'''
+    Get a list of unique users in the input data set.
+
+    Parameters:
+        data  - The input data set.
+    Returns:
+        users - An ndarray of all unique users.
+'''
+def get_usrs(data):
+    users = np.sort(np.unique(np.array(data)[:, 0].astype(int)))
+    if not len(users):
+        print('No users found.')
+        exit()
+    if DEBUG:
+        print(users, len(users))
+    return users
+
+
+'''
+    Get the data of a particular user from a data set.
+
+    Parameters:
+        data   - The input data set.
+        id     - The user identifier.
+    Returns:
+        output - A list of the data of the targeted user.
+'''
+def get_usr_data(data, id):
+    output = []
+    for d in data:
+        if (d[0] == str(id)):
+            output.append(d)
+    if DEBUG:
+        print(id, output)
+    return output
+
+
+'''
+    Get the data of every user in a data set.
+
+    Parameters:
+        data   - The input data set.
+    Returns:
+        output - A dict {usr:[usr_data]} with the data of each user.
+'''
+def get_usrs_data(data):
+    output = {}
+    for d in data:
+        output[d[0]] = output.get(d[0], []) + [d]
+    return output
+
+
+'''
+    Get the trajectory of a user from her data.
+
+    Parameters:
+        data - The data of the user.
+    Returns:
+        traj - A list [(timestamp, loc)] with the locations and corresponding timestamps that the user was at.
+'''
+def get_usr_traj(data):
+    traj = []
+    for d in data:
+        traj.append((d[1], d[4]))
+    if DEBUG:
+        print(traj)
+    return traj
+
+
+'''
+    Get all the possible transitions.
+
+    Parameters:
+        data  - The input data set.
+    Returns:
+        trans - A set with all the possible forward transitions in the input.
+'''
+def get_poss_trans(data):
+    print('Getting possible transitions... ', end='', flush=True)
+    trans = set()
+    for u, u_data in data.items():
+        traj = get_usr_traj(u_data)
+        trans.update(list(zip(traj, traj[1:])))
+    if len(trans) == 0:
+        print('None.', flush=True)
+        exit()
+    print('OK.', flush=True)
+    return trans
+
+
+'''
+    Get all backward transitions in a data set.
+
+    Parameters:
+        data  - The input data set.
+    Returns:
+        trans - A dict {(t, t-1):[transitions]} with all the backward transitions
+                at every sequential timestamp pair in the input data set.
+'''
+def get_bwd_trans(data):
+    print('Getting all backward transitions... ', end='', flush=True)
+    trans = {}
+    for u, u_data in data.items():
+        traj = get_usr_traj(u_data)
+        for i, j in zip(traj[1:], traj):
+            trans[(i[0], j[0])] = trans.get((i[0], j[0]), []) + [(i[1], j[1])]
+    if len(trans) == 0:
+        print('None.', flush=True)
+        exit()
+    print('OK.', flush=True)
+    return trans
+
+
+'''
+    Get all forward transitions in a data set.
+
+    Parameters:
+        data  - The input data set.
+    Returns:
+        trans - A dict {(t-1, t):[transitions]} with all the forward transitions
+                at every sequential timestamp pair in the input data set.
+'''
+def get_fwd_trans(data):
+    print('Getting all forward transitions... ', end='', flush=True)
+    trans = {}
+    for u, u_data in data.items():
+        traj = get_usr_traj(u_data)
+        for i, j in zip(traj, traj[1:]):
+            trans[(i[0], j[0])] = trans.get((i[0], j[0]), []) + [(i[1], j[1])]
+    if len(trans) == 0:
+        print('None.', flush=True)
+        exit()
+    print('OK.', flush=True)
+    return trans
+
+
+'''
+    Divide two numbers. If the divisor is 0, return inf.
+
+    Parameters:
+        a - The dividend.
+        b - The divisor.
+    Returns:
+        The float result of the division.
+'''
+def safe_div(a, b):
+    if b == 0:
+        return math.inf
+    return float(a/b)
+
+
+'''
+    Calculate the maximum value of the objective function.
+
+    Parameters:
+        q - A row from the transition matrix.
+        d - Another row from the transition matrix.
+        a - The backward/forward privacy loss of the previous/next
+            timestamp.
+    Returns:
+        The maximum value of the objective function.
+'''
+def max_val(q, d, a):
+    if a == math.inf:
+        return math.nan
+    return (q*(math.exp(a) - 1) + 1)/(d*(math.exp(a) - 1) + 1)
+
+
+'''
+    Find two different rows (q and d) of a transition matrix (p)
+    that maximize the product of the objective function and return
+    their sums.
+
+    Parameters:
+        p     - The transition matrix representing the backward/forward
+                correlations.
+        a     - The backward/forward privacy loss of the previous/next
+                timestamp.
+    Returns:
+        sum_q - The sum of the elements of q.
+        sum_d - The sum of the elements of d.
+'''
+def find_qd(p, a):
+    res = 0.0
+    sum_q, sum_d = 0.0, 0.0
+    for q in p:      # A row from the transition matrix.
+        for d in p:  # Another row from the transition matrix.
+            if not np.array_equal(q, d) and sum(q) and sum(d):
+                plus = []  # Indexes of the elements of q+ and d+.
+                for i in range(np.shape(p)[1]):  # Iterate each column.
+                    if q[i] > d[i]:
+                        plus.append(i)
+                s_q, s_d = 0.0, 0.0
+                update = True
+                while update:
+                    update = False
+                    # Recompute the sums from scratch at every pass;
+                    # accumulating across passes would overcount.
+                    s_q, s_d = 0.0, 0.0
+                    for i in plus:
+                        s_q += q[i]
+                        s_d += d[i]
+                    max_tmp = max_val(s_q, s_d, a)
+                    # Iterate over a copy, since removing from a list
+                    # while iterating over it skips elements.
+                    for i in list(plus):
+                        if safe_div(q[i], d[i]) <= max_tmp:
+                            plus.remove(i)
+                            update = True
+                res_tmp = math.log(max_val(s_q, s_d, a))
+                if res < res_tmp:
+                    res = res_tmp
+                    sum_q, sum_d = s_q, s_d
+    return sum_q, sum_d
+
+
+'''
+    Generate data.
+
+    Parameters:
+        usrs       - The number of users.
+        timestamps - The number of timestamps.
+        locs       - The number of locations.
+    Returns:
+        data       - The generated data.
+'''
+def gen_data(usrs, timestamps, locs):
+    print('Generating data... ', end='', flush=True)
+    # Generate timestamps.
+    ts = []
+    t_start = datetime.datetime.now()
+    for t in range(timestamps):
+        t_start += datetime.timedelta(minutes=t)
+        ts.append(str(t_start.strftime('%Y-%m-%d %H:%M')))
+    # Randomly generate unique possible transitions.
+    trans = set()
+    for i in range(locs**2):
+        trans.add((str(random.randint(1, locs)), str(random.randint(1, locs))))
+    # Generate the data.
+    data = []
+    for u in range(1, usrs + 1):
+        # Choose a random starting location.
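+        # Note: random.sample() on a set was deprecated in Python 3.9 and
+        # removed in 3.11, so we sample from a list copy of the set.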
+        l_start = random.sample(list(trans), 1)[0][0]
+        for t in ts:
+            # Choose a random next location.
+            while True:
+                l_nxt = str(random.randint(1, locs))
+                if (l_start, l_nxt) in trans:
+                    data.append([str(u), str(t), str(l_nxt), str(l_nxt), str(l_nxt)])
+                    break
+    print('OK.', flush=True)
+    return data
+
+
+'''
+    Generate a transition matrix.
+
+    Parameters:
+        n  - The dimension of the matrix.
+        s  - The correlation degree of each row [0, 1].
+             The lower its value, the lower the degree of
+             uniformity of each row.
+    Returns:
+        p_ - The transition matrix.
+'''
+def gen_trans_mt(n, s):
+    if DEBUG:
+        print('Generating transition matrix %dx%d with s = %.4f... ' %(n, n, s), end='', flush=True)
+    p = np.zeros((n, n), float)
+    np.fill_diagonal(p, 1.0)
+    p_ = np.zeros((n, n), float)
+    for j, p_j in enumerate(p[:, ]):
+        for k, p_jk in enumerate(p_j):
+            p_[j, k] = round((p[j, k] + s) / (sum(p_j) + len(p_j)*s), 4)
+    if DEBUG:
+        print('OK.', flush=True)
+    if DEBUG:
+        print(p_)
+    return p_
+
+
+'''
+    Get the transition matrix.
+
+    Parameters:
+        locs  - A list of all the locations.
+        trans - A list of all transitions.
+    Returns:
+        p     - A 2d dict {{locs}{locs}} containing the
+                corresponding location transition probabilities.
+'''
+def get_trans_mt(locs, trans):
+    if DEBUG:
+        print('Generating the transition matrix... ', end='', flush=True)
+    # Initialize the transition matrix.
+    p = collections.defaultdict(dict)
+    for i in locs:
+        for j in locs:
+            p[i][j] = 0.0
+    # Populate the transition matrix.
+    for i in locs:
+        for j in locs:
+            # Count the transitions from i to j.
+            cnt = collections.Counter(trans)[(i, j)]
+            # Count the transitions from i to any location.
+            sum_cnts = len([tr for tr in trans if tr[0] == i])
+            res = 0.0
+            if cnt != 0 and sum_cnts != 0:
+                res = cnt/sum_cnts
+            # Update transition matrix.
+            p[i][j] = res
+    if DEBUG:
+        print('OK.', flush=True)
+        for i in locs:
+            print(p[i])
+    return p
+
+
+'''
+    Calculate the measure-theoretic (Kolmogorov-Sinai) entropy
+    of a transition matrix.
+
+    Parameters:
+        mt - A 2d dict transition matrix.
+    Returns:
+        h  - The Kolmogorov-Sinai entropy of the matrix.
+'''
+def get_entropy(mt):
+    if DEBUG:
+        print('Calculating the measure-theoretic entropy... ', end='', flush=True)
+    h = 0.0
+    # Get the 2d array of the transition matrix.
+    a = get_2darray(mt)
+    # Check if the transition matrix is empty.
+    if np.allclose(a, np.zeros((len(a), len(a)), float)):
+        return h
+    # Find the stationary distribution (v), written as a row
+    # vector, that does not change under the application of
+    # the transition matrix. In other words, the left eigenvector
+    # associated with eigenvalue (w) 1. It is also called the
+    # Perron-Frobenius eigenvector; since the matrix is stochastic,
+    # the largest magnitude of the eigenvalues must be 1. We use
+    # the transpose of the transition matrix to get the left
+    # eigenvector.
+    w, v = eigs(a.T, k=1, which='LM')
+    # Normalize the eigenvector. The vector is a probability
+    # distribution, and therefore its sum must be 1. Thus, we use
+    # its absolute elements' values and its l1-norm to normalize it.
+    p = np.absolute(v[:, 0].real)
+    p_norm = np.linalg.norm(p, ord=1)
+    if p_norm:
+        p = p/p_norm
+    # Calculate the entropy.
+    if np.isclose(w.real[0], 1, atol=1e-03):
+        for i in range(len(a)):
+            for j in range(len(a)):
+                # When the transition probability is 0, use the limiting value (0).
+                if not np.isclose(a[i][j], 0, atol=1e-03) and not np.isclose(p[i], 0, atol=1e-03):
+                    # Binary logarithm measures entropy in bits.
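+                    # Sanity check (illustrative): for a = [[.5, .5], [.5, .5]] the
+                    # stationary distribution is (.5, .5) and h evaluates to 1 bit.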
+                    h += -p[i]*a[i][j]*math.log2(a[i][j])
+    if DEBUG:
+        print(h, flush=True)
+    return h
+
+
+'''
+    Convert a 2d dict to a 2d array.
+
+    Parameters:
+        mt - The 2d dict.
+    Returns:
+        p  - The 2d numpy array.
+'''
+def get_2darray(mt):
+    if type(mt) == type(np.array([])):
+        return mt
+    p = np.zeros((len(mt), len(mt)), float)
+    for i, mt_i in enumerate(mt.items()):
+        p[i] = list(mt_i[1].values())
+    return p
+
+
+'''
+    Get a Laplace probability distribution.
+
+    Parameters:
+        ts - The points of the distribution.
+        t  - The location of the distribution.
+        sc - The scale of the distribution.
+    Returns:
+        The probability distribution.
+'''
+def get_laplace_pd(ts, t, sc):
+    x = np.arange(0, len(ts), 1)
+    loc = np.where(ts == t)
+    return laplace.pdf(x, loc=loc, scale=sc)[0]
+
+
+'''
+    Get a Gaussian probability distribution.
+
+    Parameters:
+        ts - The points of the distribution.
+        t  - The location of the distribution.
+        sc - The scale of the distribution.
+    Returns:
+        The probability distribution.
+'''
+def get_norm_pd(ts, t, sc):
+    x = np.arange(0, len(ts), 1)
+    loc = np.where(ts == t)
+    return norm.pdf(x, loc=loc, scale=sc)[0]
+
+
+'''
+    Get a sample from the time series.
+
+    Parameters:
+        ts  - An ndarray of the timestamps.
+        t   - The current timestamp.
+        pct - The desired portion [0, 1] of the non-zero elements
+              of the probability distribution to be sampled.
+        pd  - The probability distribution.
+    Returns:
+        spl - An ndarray of the sampled timestamps.
+'''
+def get_sample(ts, t, pct, pd):
+    if DEBUG:
+        print('Sampling %.2f%% of %s at %s... ' %(pct*100, ts, t), end='', flush=True)
+    # Check that it is a valid timestamp.
+    if not t in ts:
+        print('%s is not in the time series.' %(t))
+        exit()
+    # Remove the current timestamp from the possible sample.
+    i = np.where(ts == t)
+    ts = np.delete(ts, i)
+    pd = np.delete(pd, i)
+    # Make sure that the probabilities sum to 1.
+    pd_norm = pd/np.linalg.norm(pd, ord=1)
+    # Get the sorted sample.
+    spl = np.sort(np.random.choice(ts, size=int(np.count_nonzero(pd)*pct), replace=False, p=pd_norm))
+    if DEBUG:
+        print(spl, flush=True)
+    return spl
+
+
+'''
+    Calculate the backward/forward privacy loss at the current
+    timestamp.
+
+    Parameters:
+        p - The transition matrix representing the backward/forward
+            temporal correlations.
+        a - The privacy loss of the previous/next timestamp.
+        e - The privacy budget for data publishing.
+    Returns:
+        The backward/forward privacy loss at the current
+        timestamp.
+'''
+def priv_l(p, a, e):
+    sum_q, sum_d = find_qd(p, a)
+    return math.log(max_val(sum_q, sum_d, a)) + e
+
+
+'''
+    Calculate the backward/forward privacy loss at the current
+    timestamp using memoization.
+
+    Parameters:
+        p - The transition matrix representing the backward/forward
+            temporal correlations.
+        a - The privacy loss of the previous/next timestamp.
+        e - The privacy budget for data publishing.
+    Returns:
+        The backward/forward privacy loss at the current
+        timestamp.
+'''
+def priv_l_m(p, a, e):
+    key = xxhash.xxh64(p).hexdigest() + str(a) + str(e)
+    global MEM, TOTAL, MISS
+    TOTAL += 1
+    result = MEM.get(key)
+    # Check for None explicitly; a cached 0.0 is a valid result.
+    if result is None:
+        MISS += 1
+        sum_q, sum_d = find_qd(p, a)
+        result = math.log(max_val(sum_q, sum_d, a)) + e
+        MEM[key] = result
+    return result
+
+
+'''
+    Calculate the total backward privacy loss at every timestamp.
+
+    Parameters:
+        p - The transition matrix representing the backward
+            temporal correlations.
+        a - The backward privacy loss of every release.
+        e - The privacy budget for data publishing.
+        t - The time limit.
+    Returns:
+        a - The backward privacy loss at every timestamp
+            due to the previous data releases.
+'''
+def bpl(p, a, e, t):
+    a[0] = e[0]
+    for i in range(1, t):
+        a[i] = priv_l(p, a[i - 1], e[i])
+    return a
+
+
+'''
+    Calculate the total backward privacy loss at the current
+    timestamp with memoization.
+
+    Parameters:
+        p - The transition matrix representing the backward
+            temporal correlations.
+        a - The backward privacy loss of the current release
+            at all previous timestamps.
+        e - The privacy budget for data publishing.
+        t - The time limit.
+    Returns:
+        a - The backward privacy loss at the current timestamp
+            due to the previous data releases.
+'''
+def bpl_m(p, a, e, t):
+    a[0] = e[0]
+    for i in range(1, t):
+        a[i] = priv_l_m(p, a[i - 1], e[i])
+    return a
+
+
+'''
+    Calculate the backward privacy loss at the current timestamp,
+    accumulating only from the previous landmark onwards, with
+    memoization.
+
+    Parameters:
+        p    - The transition matrix representing the backward
+               temporal correlations.
+        a    - The backward privacy loss at every timestamp.
+        e    - The privacy budget for data publishing.
+        t    - The timestamp of interest.
+        lmdk - The previous landmark.
+    Returns:
+        a    - The backward privacy loss at every timestamp.
+'''
+def bpl_lmdk_mem(p, a, e, t, lmdk):
+    # t is (near) the landmark.
+    if lmdk == t - 1 or t == lmdk:
+        return np.array([0]*len(a))
+    # There is no landmark before.
+    if lmdk < 1:
+        lmdk = 1
+    a[0] = e[0]
+    for i in range(lmdk, t):
+        a[i] = priv_l_m(p, a[i - 1], e[i])
+    return a
+
+
+'''
+    Calculate the total backward privacy loss at the current
+    timestamp using the static model, i.e., previous releases
+    are grouped in a window of static size.
+
+    Parameters:
+        p - The transition matrix representing the backward
+            temporal correlations.
+        e - The privacy budget for data publishing.
+        i - The timestamp of interest.
+        w - The window size to group previous releases.
+    Returns:
+        a - The backward privacy loss at the current timestamp
+            due to the previous data releases.
+'''
+def bpl_s(p, e, i, w):
+    # Check the base case first; the window checks below also match
+    # i == 1 and would otherwise make this branch unreachable.
+    if i == 1:
+        # print('bpl_s: %d' %(i))
+        return e[0]
+    elif i - w > 1:
+        # print('bpl_s: %d - %d [%d]' %(i, i - w, w))
+        return priv_l(np.linalg.matrix_power(p, w), bpl_s(p, e, i - w, w), e[i - 1])
+    else:
+        # print('bpl_s: %d - %d [%d]' %(i, 1, i - 1))
+        return priv_l(np.linalg.matrix_power(p, i - 1), e[0], e[i - 1])
+
+
+'''
+    Calculate the total backward privacy loss at the current
+    timestamp using the static model, i.e., previous releases
+    are grouped in a window of static size, using memoization.
+
+    Parameters:
+        p - The transition matrix representing the backward
+            temporal correlations.
+        e - The privacy budget for data publishing.
+        i - The timestamp of interest.
+        w - The window size to group previous releases.
+    Returns:
+        a - The backward privacy loss at the current timestamp
+            due to the previous data releases.
+'''
+def bpl_s_m(p, e, i, w):
+    # The base case comes first (see bpl_s).
+    if i == 1:
+        return e[0]
+    elif i - w > 1:
+        return priv_l_m(np.linalg.matrix_power(p, w), bpl_s_m(p, e, i - w, w), e[i - 1])
+    else:
+        return priv_l_m(np.linalg.matrix_power(p, i - 1), e[0], e[i - 1])
+
+
+'''
+    Calculate the total backward privacy loss at the current
+    timestamp using the linear model, i.e., previous releases
+    are grouped in a window of a size that increases linearly.
+
+    Parameters:
+        p - The transition matrix representing the backward
+            temporal correlations.
+        e - The privacy budget for data publishing.
+        i - The timestamp of interest.
+        w - The window size to group previous releases.
+        l - The linearly increasing coefficient that affects the
+            window size.
+    Returns:
+        a - The backward privacy loss at the current timestamp
+            due to the previous data releases.
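+
+        Example (illustrative): with w = 2 and l starting at 1, the
+        previous releases are grouped in windows of size 2, 4, 6, ...
+        moving backward from the timestamp of interest.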
+'''
+def bpl_l(p, e, i, w, l):
+    # The base case comes first (see bpl_s).
+    if i == 1:
+        # print('bpl_l: %d' %(1))
+        return e[0]
+    elif i - w*l > 1:
+        # print('bpl_l: %d - %d [%d]' %(i, i - w*l, w*l))
+        return priv_l(np.linalg.matrix_power(p, w*l), bpl_l(p, e, i - w*l, w, l + 1), e[i - 1])
+    else:
+        # print('bpl_l: %d - %d [%d]' %(i, 1, i - 1))
+        return priv_l(np.linalg.matrix_power(p, i - 1), e[0], e[i - 1])
+
+
+'''
+    Calculate the total backward privacy loss at the current
+    timestamp using the linear model, i.e., previous releases
+    are grouped in a window of a size that increases linearly,
+    using memoization.
+
+    Parameters:
+        p - The transition matrix representing the backward
+            temporal correlations.
+        e - The privacy budget for data publishing.
+        i - The timestamp of interest.
+        w - The window size to group previous releases.
+        l - The linearly increasing coefficient that affects the
+            window size.
+    Returns:
+        a - The backward privacy loss at the current timestamp
+            due to the previous data releases.
+'''
+def bpl_l_m(p, e, i, w, l):
+    # The base case comes first (see bpl_s).
+    if i == 1:
+        return e[0]
+    elif i - w*l > 1:
+        return priv_l_m(np.linalg.matrix_power(p, w*l), bpl_l_m(p, e, i - w*l, w, l + 1), e[i - 1])
+    else:
+        return priv_l_m(np.linalg.matrix_power(p, i - 1), e[0], e[i - 1])
+
+
+'''
+    Calculate the total backward privacy loss at the current
+    timestamp using the exponential model, i.e., previous releases
+    are grouped in a window of a size that increases exponentially.
+
+    Parameters:
+        p - The transition matrix representing the backward
+            temporal correlations.
+        e - The privacy budget for data publishing.
+        i - The timestamp of interest.
+        w - The window size to group previous releases.
+        h - The exponentially increasing coefficient that affects
+            the window size.
+    Returns:
+        a - The backward privacy loss at the current timestamp
+            due to the previous data releases.
+'''
+def bpl_e(p, e, i, w, h):
+    # The base case comes first (see bpl_s).
+    if i == 1:
+        # print('bpl_e: %d' %(1))
+        return e[0]
+    elif i - w**h > 1:
+        # print('bpl_e: %d - %d [%d]' %(i, i - w**h, w**h))
+        return priv_l(np.linalg.matrix_power(p, w**h), bpl_e(p, e, i - w**h, w, h + 1), e[i - 1])
+    else:
+        # print('bpl_e: %d - %d [%d]' %(i, 1, i - 1))
+        return priv_l(np.linalg.matrix_power(p, i - 1), e[0], e[i - 1])
+
+
+'''
+    Calculate the total backward privacy loss at the current
+    timestamp using the exponential model, i.e., previous releases
+    are grouped in a window of a size that increases exponentially,
+    using memoization.
+
+    Parameters:
+        p - The transition matrix representing the backward
+            temporal correlations.
+        e - The privacy budget for data publishing.
+        i - The timestamp of interest.
+        w - The window size to group previous releases.
+        h - The exponentially increasing coefficient that affects
+            the window size.
+    Returns:
+        a - The backward privacy loss at the current timestamp
+            due to the previous data releases.
+'''
+def bpl_e_m(p, e, i, w, h):
+    # The base case comes first (see bpl_s).
+    if i == 1:
+        return e[0]
+    elif i - w**h > 1:
+        return priv_l_m(np.linalg.matrix_power(p, w**h), bpl_e_m(p, e, i - w**h, w, h + 1), e[i - 1])
+    else:
+        return priv_l_m(np.linalg.matrix_power(p, i - 1), e[0], e[i - 1])
+
+
+'''
+    Calculate the total forward privacy loss at the current
+    timestamp.
+
+    Parameters:
+        p - The transition matrix representing the forward
+            temporal correlations.
+        a - The forward privacy loss of the current release
+            at all next timestamps.
+        e - The privacy budget for data publishing.
+        t - The time limit.
+    Returns:
+        a - The forward privacy loss at the current timestamp
+            due to the next data releases.
+'''
+def fpl(p, a, e, t):
+    a[t - 1] = e[t - 1]
+    for i in range(t - 2, -1, -1):
+        a[i] = priv_l(p, a[i + 1], e[i])
+    return a
+
+
+'''
+    Calculate the total forward privacy loss at the current
+    timestamp, using memoization.
+
+    Parameters:
+        p - The transition matrix representing the forward
+            temporal correlations.
+        a - The forward privacy loss of the current release
+            at all next timestamps.
+        e - The privacy budget for data publishing.
+        t - The time limit.
+    Returns:
+        a - The forward privacy loss at the current timestamp
+            due to the next data releases.
+'''
+def fpl_m(p, a, e, t):
+    a[t - 1] = e[t - 1]
+    for i in range(t - 2, -1, -1):
+        a[i] = priv_l_m(p, a[i + 1], e[i])
+    return a
+
+
+'''
+    Calculate the forward privacy loss at the current timestamp,
+    accumulating only up to the next landmark, with memoization.
+
+    Parameters:
+        p    - The transition matrix representing the forward
+               temporal correlations.
+        a    - The forward privacy loss at every timestamp.
+        e    - The privacy budget for data publishing.
+        t    - The timestamp of interest.
+        lmdk - The next landmark.
+    Returns:
+        a    - The forward privacy loss at every timestamp.
+'''
+def fpl_lmdk_mem(p, a, e, t, lmdk):
+    # t is (near) the landmark.
+    if lmdk == t + 1 or t == lmdk:
+        return np.array([0]*len(a))
+    # There is no landmark after.
+    if lmdk > len(e):
+        lmdk = len(e)
+    a[lmdk - 1] = e[lmdk - 1]
+    for i in range(lmdk - 2, t - 2, -1):
+        a[i] = priv_l_m(p, a[i + 1], e[i])
+    return a
+
+
+'''
+    Calculate the total forward privacy loss at the current
+    timestamp using the static model, i.e., next releases
+    are grouped in a window of static size.
+
+    Parameters:
+        p - The transition matrix representing the forward
+            temporal correlations.
+        e - The privacy budget for data publishing.
+        i - The timestamp of interest.
+        t - The time limit.
+        w - The window size to group next releases.
+    Returns:
+        a - The forward privacy loss at the current timestamp
+            due to the next data releases.
+'''
+def fpl_s(p, e, i, t, w):
+    # Check the base case first; the window checks below also match
+    # i == t and would otherwise make this branch unreachable.
+    if i == t:
+        # print('fpl_s: %d' %(t))
+        return e[t - 1]
+    elif i + w < t:
+        # print('fpl_s: %d - %d [%d]' %(i, i + w, w))
+        return priv_l(np.linalg.matrix_power(p, w), fpl_s(p, e, i + w, t, w), e[i - 1])
+    else:
+        # print('fpl_s: %d - %d [%d]' %(i, t, t - i))
+        return priv_l(np.linalg.matrix_power(p, t - i), e[t - 1], e[i - 1])
+
+
+'''
+    Calculate the total forward privacy loss at the current
+    timestamp using the static model, i.e., next releases
+    are grouped in a window of static size, using memoization.
+
+    Parameters:
+        p - The transition matrix representing the forward
+            temporal correlations.
+        e - The privacy budget for data publishing.
+        i - The timestamp of interest.
+        t - The time limit.
+        w - The window size to group next releases.
+    Returns:
+        a - The forward privacy loss at the current timestamp
+            due to the next data releases.
+'''
+def fpl_s_m(p, e, i, t, w):
+    # The base case comes first (see fpl_s).
+    if i == t:
+        return e[t - 1]
+    elif i + w < t:
+        return priv_l_m(np.linalg.matrix_power(p, w), fpl_s_m(p, e, i + w, t, w), e[i - 1])
+    else:
+        return priv_l_m(np.linalg.matrix_power(p, t - i), e[t - 1], e[i - 1])
+
+
+'''
+    Calculate the total forward privacy loss at the current
+    timestamp using the linear model, i.e., next releases
+    are grouped in a window of a size that increases linearly.
+
+    Parameters:
+        p - The transition matrix representing the forward
+            temporal correlations.
+        e - The privacy budget for data publishing.
+        i - The timestamp of interest.
+        t - The time limit.
+        w - The window size to group next releases.
+        l - The linearly increasing coefficient that affects the
+            window size.
+    Returns:
+        a - The forward privacy loss at the current timestamp
+            due to the next data releases.
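+
+        Example (illustrative): with w = 2 and l starting at 1, the
+        next releases are grouped in windows of size 2, 4, 6, ...
+        moving forward towards the time limit.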
+'''
+def fpl_l(p, e, i, t, w, l):
+    # The base case comes first (see fpl_s).
+    if i == t:
+        # print('fpl_l: %d' %(t))
+        return e[t - 1]
+    elif i + w*l < t:
+        # print('fpl_l: %d - %d [%d]' %(i, i + w*l, w*l))
+        return priv_l(np.linalg.matrix_power(p, w*l), fpl_l(p, e, i + w*l, t, w, l + 1), e[i - 1])
+    else:
+        # print('fpl_l: %d - %d [%d]' %(i, t, t - i))
+        return priv_l(np.linalg.matrix_power(p, t - i), e[t - 1], e[i - 1])
+
+
+'''
+    Calculate the total forward privacy loss at the current
+    timestamp using the linear model, i.e., next releases
+    are grouped in a window of a size that increases linearly,
+    using memoization.
+
+    Parameters:
+        p - The transition matrix representing the forward
+            temporal correlations.
+        e - The privacy budget for data publishing.
+        i - The timestamp of interest.
+        t - The time limit.
+        w - The window size to group next releases.
+        l - The linearly increasing coefficient that affects the
+            window size.
+    Returns:
+        a - The forward privacy loss at the current timestamp
+            due to the next data releases.
+'''
+def fpl_l_m(p, e, i, t, w, l):
+    # The base case comes first (see fpl_s).
+    if i == t:
+        return e[t - 1]
+    elif i + w*l < t:
+        return priv_l_m(np.linalg.matrix_power(p, w*l), fpl_l_m(p, e, i + w*l, t, w, l + 1), e[i - 1])
+    else:
+        return priv_l_m(np.linalg.matrix_power(p, t - i), e[t - 1], e[i - 1])
+
+
+'''
+    Calculate the total forward privacy loss at the current
+    timestamp using the exponential model, i.e., next releases
+    are grouped in a window of a size that increases exponentially.
+
+    Parameters:
+        p - The transition matrix representing the forward
+            temporal correlations.
+        e - The privacy budget for data publishing.
+        i - The timestamp of interest.
+        t - The time limit.
+        w - The window size to group next releases.
+        h - The exponentially increasing coefficient that affects
+            the window size.
+    Returns:
+        a - The forward privacy loss at the current timestamp
+            due to the next data releases.
+'''
+def fpl_e(p, e, i, t, w, h):
+    # The base case comes first (see fpl_s).
+    if i == t:
+        # print('fpl_e: %d' %(t))
+        return e[t - 1]
+    elif i + w**h < t:
+        # print('fpl_e: %d - %d [%d]' %(i, i + w**h, w**h))
+        return priv_l(np.linalg.matrix_power(p, w**h), fpl_e(p, e, i + w**h, t, w, h + 1), e[i - 1])
+    else:
+        # print('fpl_e: %d - %d [%d]' %(i, t, t - i))
+        return priv_l(np.linalg.matrix_power(p, t - i), e[t - 1], e[i - 1])
+
+
+'''
+    Calculate the total forward privacy loss at the current
+    timestamp using the exponential model, i.e., next releases
+    are grouped in a window of a size that increases exponentially,
+    using memoization.
+
+    Parameters:
+        p - The transition matrix representing the forward
+            temporal correlations.
+        e - The privacy budget for data publishing.
+        i - The timestamp of interest.
+        t - The time limit.
+        w - The window size to group next releases.
+        h - The exponentially increasing coefficient that affects
+            the window size.
+    Returns:
+        a - The forward privacy loss at the current timestamp
+            due to the next data releases.
+'''
+def fpl_e_m(p, e, i, t, w, h):
+    # The base case comes first (see fpl_s).
+    if i == t:
+        return e[t - 1]
+    elif i + w**h < t:
+        return priv_l_m(np.linalg.matrix_power(p, w**h), fpl_e_m(p, e, i + w**h, t, w, h + 1), e[i - 1])
+    else:
+        return priv_l_m(np.linalg.matrix_power(p, t - i), e[t - 1], e[i - 1])
+
+
+'''
+    Calculate the total privacy loss at every timestamp.
+
+    Parameters:
+        bpl - The backward privacy loss.
+        fpl - The forward privacy loss.
+        e   - The privacy budget for data publishing.
+    Returns:
+        The list of total privacy loss at every timestamp.
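+
+        At each timestamp the total loss is bpl + fpl - e, since the
+        budget of the current release is counted in both the backward
+        and the forward loss and must be subtracted once.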
+'''
+def tpl(bpl, fpl, e):
+    return [x + y - z for (x, y, z) in zip(bpl, fpl, e)]
+
+
+'''
+    Calculate the temporal privacy loss at every timestamp
+    taking into account landmarks.
+
+    Parameters:
+        e     - The privacy budget for data publishing.
+        p_b   - The transition matrix representing the backward
+                temporal correlations.
+        p_f   - The transition matrix representing the forward
+                temporal correlations.
+        seq   - The point sequence.
+        lmdks - The landmarks.
+    Returns:
+        a_b   - The backward privacy loss at the current timestamp
+                due to the previous data releases.
+        a_f   - The forward privacy loss at the current timestamp
+                due to the next data releases.
+        a     - The total privacy loss at every timestamp
+                taking into account landmarks.
+'''
+def tpl_lmdk_mem(e, p_b, p_f, seq, lmdks):
+    a_b = np.zeros(len(seq))
+    a_f = np.zeros(len(seq))
+    a = np.zeros(len(seq))
+    for t in seq:
+        t_prv, t_nxt = get_limits(t, seq, lmdks)
+        a_b[t - 1] = bpl_lmdk_mem(p_b, np.zeros(max(seq)), e, t, t_prv)[t - 1]
+        a_f[t - 1] = fpl_lmdk_mem(p_f, np.zeros(max(seq)), e, t, t_nxt)[t - 1]
+        if a_b[t - 1] > 0 and a_f[t - 1] > 0:
+            a[t - 1] = a_b[t - 1] + a_f[t - 1] - e[t - 1]
+        elif a_b[t - 1] > 0 or a_f[t - 1] > 0:
+            a[t - 1] = a_b[t - 1] + a_f[t - 1]
+        else:
+            a[t - 1] = e[t - 1]
+    return a_b, a_f, a
+
+
+'''
+    Get the limits for the calculation of temporal privacy loss.
+
+    Parameters:
+        t     - The current timestamp.
+        seq   - The point sequence.
+        lmdks - The landmarks.
+    Returns:
+        t_prv - The previous landmark.
+        t_nxt - The next landmark.
+'''
+def get_limits(t, seq, lmdks):
+    # Add landmark limits.
+    seq_lmdks = np.copy(lmdks)
+    # if seq[0] not in seq_lmdks:
+    #     seq_lmdks = np.append(seq_lmdks, seq[0])
+    # if seq[len(seq) - 1] not in seq_lmdks:
+    #     seq_lmdks = np.append(seq_lmdks, seq[len(seq) - 1])
+    # seq_lmdks = np.sort(seq_lmdks)
+    # Initialize previous and next landmarks.
+    t_prv = 0
+    t_nxt = len(seq) + 1
+    # Add current timestamp to landmarks.
+    seq_lmdks_tmp = np.copy(seq_lmdks)
+    if t not in seq_lmdks_tmp[:]:
+        seq_lmdks_tmp = np.append(seq_lmdks, t)
+        seq_lmdks_tmp = np.sort(seq_lmdks_tmp)
+    # Find the index of the current timestamp in the landmarks.
+    idx, = np.where(seq_lmdks_tmp == t)
+    i = idx[0]
+    # Find previous landmark.
+    if i != 0:
+        t_prv = seq_lmdks_tmp[i - 1]
+    # Find next landmark.
+    if i != len(seq_lmdks_tmp) - 1:
+        t_nxt = seq_lmdks_tmp[i + 1]
+    return t_prv, t_nxt
+
+
+'''
+    Plots the privacy loss of the time series.
+
+    Parameters:
+        title - The title of the plot.
+        e     - The privacy budget for data publishing.
+        a_b   - The backward privacy loss.
+        a_f   - The forward privacy loss.
+        a     - The total privacy loss.
+    Returns:
+        Nothing.
+'''
+def plot_loss(title, e, a_b, a_f, a):
+    plt.rc('font', family='serif')
+    plt.rc('font', size=10)
+    plt.rc('text', usetex=True)
+    p = np.arange(1.0, len(e) + 1, 1)
+    plt.plot(p,
+             e,
+             label=r'$\varepsilon$',
+             color='gray',
+             marker='s')
+    plt.plot(p,
+             a_b,
+             label=r'$\alpha^B$',
+             color='#e52e4e',  # red
+             marker='<')
+    plt.plot(p,
+             a_f,
+             label=r'$\alpha^F$',
+             color='#009874',  # green
+             marker='>')
+    plt.plot(p,
+             a,
+             label=r'$\alpha$',
+             color='#6082b6',  # blue
+             marker='d')
+    plt.axis([1.0, len(e),
+              0.0, 1.5*max(a)])
+    plt.legend(loc='best', frameon=True)
+    plt.grid(axis='y', alpha=1.0)  # Add grid on y axis.
+    plt.xlabel('time')             # Set x axis label.
+    plt.ylabel('privacy loss')     # Set y axis label.
+    plt.title(title)
+    plt.show()
+
+
+'''
+    Plots a comparison of the privacy loss of all models.
+
+    Parameters:
+        title - The title of the plot.
+        a     - The privacy loss of the basic model.
+        a_s   - The privacy loss of the static model.
+        a_e   - The privacy loss of the exponential model.
+        a_l   - The privacy loss of the linear model.
+    Returns:
+        Nothing.
+'''
+def cmp_loss(title, a, a_s, a_e, a_l):
+    plt.rc('font', family='serif')
+    plt.rc('font', size=10)
+    plt.rc('text', usetex=True)
+    p = np.arange(1.0, len(a) + 1, 1)
+    plt.plot(p,
+             a,
+             label=r'$\alpha$',
+             color='red',
+             marker='s')
+    plt.plot(p,
+             a_s,
+             label=r'$\alpha_s$',
+             color='green',
+             marker='>')
+    plt.plot(p,
+             a_e,
+             label=r'$\alpha_e$',
+             color='blue',
+             marker='d')
+    plt.plot(p,
+             a_l,
+             label=r'$\alpha_l$',
+             color='yellow',
+             marker='<')
+    plt.axis([1.0, len(a),
+              0.0, 1.5*max(a)])
+    plt.legend(loc='best', frameon=True)
+    plt.grid(axis='y', alpha=1.0)  # Add grid on y axis.
+    plt.xlabel('time')             # Set x axis label.
+    plt.ylabel('privacy loss')     # Set y axis label.
+    plt.title(title)
+    plt.show()
+
+
+'''
+    Parse arguments.
+
+    Mandatory:
+        model, The model to be used: (b)asic,
+        (s)tatic, (l)inear, (e)xponential.
+
+    Optional:
+        -c, --correlation, The correlation degree.
+        -d, --debug,       Enable debugging mode.
+        -e, --epsilon,     The available privacy budget.
+        -m, --matrix,      The size of the transition matrix.
+        -o, --output,      The path to the output directory.
+        -t, --time,        The time limit.
+        -w, --window,      The size of the event protection window.
+'''
+def parse_args():
+    # Create argument parser.
+    parser = argparse.ArgumentParser()
+
+    # Mandatory arguments.
+    parser.add_argument('model', help='The model to be used.', choices=['b', 's', 'l', 'e'], type=str)
+
+    # Optional arguments.
+    parser.add_argument('-c', '--correlation', help='The correlation degree.', type=float, default=0.1)
+    parser.add_argument('-d', '--debug', help='Enable debugging mode.', action='store_true')
+    parser.add_argument('-e', '--epsilon', help='The available privacy budget.', type=float, default=1.0)
+    parser.add_argument('-m', '--matrix', help='The matrix size.', type=int, default=2)
+    parser.add_argument('-o', '--output', help='The path to the output directory.', type=str, default='')
+    parser.add_argument('-t', '--time', help='The time limit.', type=int, default=1)
+    parser.add_argument('-w', '--window', help='The size of the event protection window.', type=int, default=1)
+
+    # Parse arguments.
+    args = parser.parse_args()
+
+    return args
+
+
+def main(args):
+
+    global DEBUG, MISS, TOTAL
+    DEBUG = args.debug
+    MISS, TOTAL = 0, 0
+
+    e = [args.epsilon]*args.time
+
+    # Generate transition matrices.
+    p_b = gen_trans_mt(args.matrix, args.correlation)
+    p_f = gen_trans_mt(args.matrix, args.correlation)
+
+    if DEBUG:
+        # Run basic for comparison.
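+        # The non-memoized results computed here serve as the reference
+        # for the relative differences that the other models print below.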
+        a_b = bpl(p_b, [0.0] * args.time, e, args.time)
+        a_f = fpl(p_f, [0.0] * args.time, e, args.time)
+        a = tpl(a_b, a_f, e)
+
+    output_path = args.output +\
+        '/' + args.model +\
+        '_c' + str(args.correlation) +\
+        '_e' + str(args.epsilon) +\
+        '_m' + str(args.matrix) +\
+        '_t' + str(args.time) +\
+        '_w' + str(args.window)
+
+    if args.model == 'b':
+        # Basic
+
+        # start_time = time.process_time()
+        a_b = bpl_m(p_b, [0.0] * args.time, e, args.time)
+        a_f = fpl_m(p_f, [0.0] * args.time, e, args.time)
+        a = tpl(a_b, a_f, e)
+        # duration1 = time.process_time() - start_time
+
+        if DEBUG:
+            print('\nBasic\n'
+                  't\tbpl\t\tfpl\t\ttpl')
+            for i in range(0, args.time):
+                print('%d\t%f\t%f\t%f' %(i + 1, a_b[i], a_f[i], a[i]))
+            print('\nSUM\t%f\t%f\t%f' %(sum(a_b), sum(a_f), sum(a)))
+            print('AVG\t%f\t%f\t%f' %(sum(a_b)/args.time, sum(a_f)/args.time, sum(a)/args.time))
+            plot_loss('Basic', e, a_b, a_f, a)
+        if args.output:
+            save_output(output_path, args.time, e, a_b, a_f, a)
+
+        # start_time = time.process_time()
+        # a_b_m = bpl_m(p_b, [0.0] * args.time, e, args.time)
+        # a_f_m = fpl_m(p_f, [0.0] * args.time, e, args.time)
+        # a_m = tpl(a_b_m, a_f_m, e)
+        # duration2 = time.process_time() - start_time
+
+        # diff = ((duration2 - duration1)/duration1)*100
+        # success = safe_div((TOTAL - MISS)*100, TOTAL)
+        # print('##############################\n'
+        #       'basic : %.4fs\n'
+        #       'w/ mem: %.4fs\n'
+        #       '- succ: %d/%d (%.2f%%)\n'
+        #       'diff  : %.4fs (%.2f%%)\n'
+        #       'OK    : %r'
+        #       %(duration1,
+        #         duration2,
+        #         TOTAL - MISS, TOTAL, success,
+        #         duration2 - duration1, diff,
+        #         np.array_equal(a, a_m)))
+        # MISS, TOTAL = 0, 0
+    elif args.model == 's':
+        # Static
+
+        # start_time = time.process_time()
+        a_s_b, a_s_f = [None]*args.time, [None]*args.time
+        for i in range(1, args.time + 1):
+            a_s_b[i - 1] = bpl_s_m(p_b, e, i, args.window)
+            a_s_f[i - 1] = fpl_s_m(p_f, e, i, args.time, args.window)
+        a_s = tpl(a_s_b, a_s_f, e)
+        # duration1 = time.process_time() - start_time
+
+        if DEBUG:
+            print('\nStatic\n'
+                  't\tbpl\t\t\tfpl\t\t\ttpl')
+            for i in range(0, args.time):
+                print('%d\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
+                      %(i + 1,
+                        a_s_b[i], (a_s_b[i] - a_b[i])/a_b[i]*100,
+                        a_s_f[i], (a_s_f[i] - a_f[i])/a_f[i]*100,
+                        a_s[i], (a_s[i] - a[i])/a[i]*100))
+            print('\nSUM\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
+                  %(sum(a_s_b), ((sum(a_s_b) - sum(a_b))/sum(a_b))*100,
+                    sum(a_s_f), ((sum(a_s_f) - sum(a_f))/sum(a_f))*100,
+                    sum(a_s), ((sum(a_s) - sum(a))/sum(a))*100))
+            print('AVG\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
+                  %(sum(a_s_b)/args.time, ((sum(a_s_b)/args.time - sum(a_b)/args.time)/(sum(a_b)/args.time))*100,
+                    sum(a_s_f)/args.time, ((sum(a_s_f)/args.time - sum(a_f)/args.time)/(sum(a_f)/args.time))*100,
+                    sum(a_s)/args.time, ((sum(a_s)/args.time - sum(a)/args.time)/(sum(a)/args.time))*100))
+            plot_loss('Static', e, a_s_b, a_s_f, a_s)
+        if args.output:
+            save_output(output_path, args.time, e, a_s_b, a_s_f, a_s)
+
+        # start_time = time.process_time()
+        # a_s_b_m, a_s_f_m = [None]*args.time, [None]*args.time
+        # for i in range(1, args.time + 1):
+        #     a_s_b_m[i - 1] = bpl_s_m(p_b, e, i, args.window)
+        #     a_s_f_m[i - 1] = fpl_s_m(p_f, e, i, args.time, args.window)
+        # a_s_m = tpl(a_s_b_m, a_s_f_m, e)
+        # duration2 = time.process_time() - start_time
+
+        # diff = ((duration2 - duration1)/duration1)*100
+        # success = safe_div((TOTAL - MISS)*100, TOTAL)
+        # print('##############################\n'
+        #       'static: %.4fs\n'
+        #       'w/ mem: %.4fs\n'
+        #       '- succ: %d/%d (%.2f%%)\n'
+        #       'diff  : %.4fs (%.2f%%)\n'
+        #       'OK    : %r'
+        #       %(duration1,
+        #         duration2,
+        #         TOTAL - MISS, TOTAL, success,
+        #         duration2 - duration1, diff,
+        #         np.array_equal(a_s, a_s_m)))
+        # MISS, TOTAL = 0, 0
+    elif args.model == 'l':
+        # Linear
+        l = 1
+
+        # start_time = time.process_time()
+        a_l_b, a_l_f = [None]*args.time, [None]*args.time
+        for i in range(1, args.time + 1):
+            a_l_b[i - 1] = bpl_l_m(p_b, e, i, args.window, l)
+            a_l_f[i - 1] = fpl_l_m(p_f, e, i, args.time, args.window, l)
+        a_l = tpl(a_l_b, a_l_f, e)
+        # duration1 = time.process_time() - start_time
+
+        if DEBUG:
+            print('\nLinear\n'
+                  't\tbpl\t\t\tfpl\t\t\ttpl')
+            for i in range(0, args.time):
+                print('%d\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
+                      %(i + 1,
+                        a_l_b[i], (a_l_b[i] - a_b[i])/a_b[i]*100,
+                        a_l_f[i], (a_l_f[i] - a_f[i])/a_f[i]*100,
+                        a_l[i], (a_l[i] - a[i])/a[i]*100))
+            print('\nSUM\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
+                  %(sum(a_l_b), ((sum(a_l_b) - sum(a_b))/sum(a_b))*100,
+                    sum(a_l_f), ((sum(a_l_f) - sum(a_f))/sum(a_f))*100,
+                    sum(a_l), ((sum(a_l) - sum(a))/sum(a))*100))
+            print('AVG\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
+                  %(sum(a_l_b)/args.time, ((sum(a_l_b)/args.time - sum(a_b)/args.time)/(sum(a_b)/args.time))*100,
+                    sum(a_l_f)/args.time, ((sum(a_l_f)/args.time - sum(a_f)/args.time)/(sum(a_f)/args.time))*100,
+                    sum(a_l)/args.time, ((sum(a_l)/args.time - sum(a)/args.time)/(sum(a)/args.time))*100))
+            plot_loss('Linear', e, a_l_b, a_l_f, a_l)
+        if args.output:
+            save_output(output_path, args.time, e, a_l_b, a_l_f, a_l)
+
+        # start_time = time.process_time()
+        # a_l_b_m, a_l_f_m = [None]*args.time, [None]*args.time
+        # for i in range(1, args.time + 1):
+        #     a_l_b_m[i - 1] = bpl_l_m(p_b, e, i, args.window, l)
+        #     a_l_f_m[i - 1] = fpl_l_m(p_f, e, i, args.time, args.window, l)
+        # a_l_m = tpl(a_l_b_m, a_l_f_m, e)
+        # duration2 = time.process_time() - start_time
+
+        # diff = ((duration2 - duration1)/duration1)*100
+        # success = safe_div((TOTAL - MISS)*100, TOTAL)
+        # print('##############################\n'
+        #       'linear: %.4fs\n'
+        #       'w/ mem: %.4fs\n'
+        #       '- succ: %d/%d (%.2f%%)\n'
+        #       'diff  : %.4fs (%.2f%%)\n'
+        #       'OK    : %r'
+        #       %(duration1,
+        #         duration2,
+        #         TOTAL - MISS, TOTAL, success,
+        #         duration2 - duration1, diff,
+        #         np.array_equal(a_l, a_l_m)))
+        # MISS, TOTAL = 0, 0
+    elif args.model == 'e':
+        # Exponential
+        h = 1
+
+        # start_time = time.process_time()
+        a_e_b, a_e_f = [None]*args.time, [None]*args.time
+        for i in range(1, args.time + 1):
+            a_e_b[i - 1] = bpl_e_m(p_b, e, i, args.window, h)
+            a_e_f[i - 1] = fpl_e_m(p_f, e, i, args.time, args.window, h)
+        a_e = tpl(a_e_b, a_e_f, e)
+        # duration1 = time.process_time() - start_time
+
+        if DEBUG:
+            print('\nExponential\n'
+                  't\tbpl\t\t\tfpl\t\t\ttpl')
+            for i in range(0, args.time):
+                print('%d\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
+                      %(i + 1,
+                        a_e_b[i], (a_e_b[i] - a_b[i])/a_b[i]*100,
+                        a_e_f[i], (a_e_f[i] - a_f[i])/a_f[i]*100,
+                        a_e[i], (a_e[i] - a[i])/a[i]*100))
+            print('\nSUM\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
+                  %(sum(a_e_b), ((sum(a_e_b) - sum(a_b))/sum(a_b))*100,
+                    sum(a_e_f), ((sum(a_e_f) - sum(a_f))/sum(a_f))*100,
+                    sum(a_e), ((sum(a_e) - sum(a))/sum(a))*100))
+            print('AVG\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
+                  %(sum(a_e_b)/args.time, ((sum(a_e_b)/args.time - sum(a_b)/args.time)/(sum(a_b)/args.time))*100,
+                    sum(a_e_f)/args.time, ((sum(a_e_f)/args.time - sum(a_f)/args.time)/(sum(a_f)/args.time))*100,
+                    sum(a_e)/args.time, ((sum(a_e)/args.time - sum(a)/args.time)/(sum(a)/args.time))*100))
+            plot_loss('Exponential', e, a_e_b, a_e_f, a_e)
+        if args.output:
+            save_output(output_path, args.time, e,
+                        a_e_b, a_e_f, a_e)
+
+        # start_time = time.process_time()
+        # a_e_b_m, a_e_f_m = [None]*args.time, [None]*args.time
+        # for i in range(1, args.time + 1):
+        #     a_e_b_m[i - 1] = bpl_e_m(p_b, e, i, args.window, h)
+        #     a_e_f_m[i - 1] = fpl_e_m(p_f, e, i, args.time, args.window, h)
+        # a_e_m = tpl(a_e_b_m, a_e_f_m, e)
+        # duration2 = time.process_time() - start_time
+
+        # diff = ((duration2 - duration1)/duration1)*100
+        # success = safe_div((TOTAL - MISS)*100, TOTAL)
+        # print('##############################\n'
+        #       'exp   : %.4fs\n'
+        #       'w/ mem: %.4fs\n'
+        #       '- succ: %d/%d (%.2f%%)\n'
+        #       'diff  : %.4fs (%.2f%%)\n'
+        #       'OK    : %r'
+        #       %(duration1,
+        #         duration2,
+        #         TOTAL - MISS, TOTAL, success,
+        #         duration2 - duration1, diff,
+        #         np.array_equal(a_e, a_e_m)))
+        # MISS, TOTAL = 0, 0
+
+    # if DEBUG:
+    #     cmp_loss('Backward', a_b, a_s_b, a_l_b, a_e_b)
+    #     cmp_loss('Forward', a_f, a_s_f, a_l_f, a_e_f)
+    #     cmp_loss('Total', a, a_s, a_l, a_e)
+
+
+if __name__ == '__main__':
+    try:
+        args = parse_args()
+        print('##############################\n'
+              'Correlation: %f\n'
+              'Debug      : %r\n'
+              'Epsilon    : %f\n'
+              'Matrix     : %d\n'
+              'Model      : %s\n'
+              'Output     : %s\n'
+              'Time       : %d\n'
+              'Window     : %d\n'
+              '##############################'
+              %(args.correlation,
+                args.debug,
+                args.epsilon,
+                args.matrix,
+                args.model,
+                args.output,
+                args.time,
+                args.window), flush=True)
+        start_time = time.process_time()
+        main(args)
+        end_time = time.process_time()
+        print('##############################\n'
+              'Time       : %.4fs\n'
+              '##############################\n'
+              %(end_time - start_time), flush=True)
+    except KeyboardInterrupt:
+        print('Interrupted by user.')
+        exit()
diff --git a/graphics/avg-dist.pdf b/graphics/evaluation/avg-dist.pdf
similarity index 73%
rename from graphics/avg-dist.pdf
rename to graphics/evaluation/avg-dist.pdf
index 8d7751b..6a4c9f0 100644
Binary files a/graphics/avg-dist.pdf and b/graphics/evaluation/avg-dist.pdf differ
diff --git a/graphics/evaluation/dist-cor-mod.pdf b/graphics/evaluation/dist-cor-mod.pdf
new file mode 100644
index 0000000..4f65201
Binary files /dev/null and b/graphics/evaluation/dist-cor-mod.pdf differ
diff --git a/graphics/dist-cor-mod.pdf b/graphics/evaluation/dist-cor-stg.pdf
similarity index 69%
rename from graphics/dist-cor-mod.pdf
rename to graphics/evaluation/dist-cor-stg.pdf
index 275cee8..8df46a5 100644
Binary files a/graphics/dist-cor-mod.pdf and b/graphics/evaluation/dist-cor-stg.pdf differ
diff --git a/graphics/dist-cor-wk.pdf b/graphics/evaluation/dist-cor-wk.pdf
similarity index 68%
rename from graphics/dist-cor-wk.pdf
rename to graphics/evaluation/dist-cor-wk.pdf
index 0ddc106..e098a93 100644
Binary files a/graphics/dist-cor-wk.pdf and b/graphics/evaluation/dist-cor-wk.pdf differ
diff --git a/rslt/avg_dist/avg-dist.pdf b/rslt/avg_dist/avg-dist.pdf
new file mode 100644
index 0000000..6a4c9f0
Binary files /dev/null and b/rslt/avg_dist/avg-dist.pdf differ
diff --git a/rslt/dist_cor/dist-cor-mod.pdf b/rslt/dist_cor/dist-cor-mod.pdf
new file mode 100644
index 0000000..4f65201
Binary files /dev/null and b/rslt/dist_cor/dist-cor-mod.pdf differ
diff --git a/graphics/dist-cor-stg.pdf b/rslt/dist_cor/dist-cor-stg.pdf
similarity index 68%
rename from graphics/dist-cor-stg.pdf
rename to rslt/dist_cor/dist-cor-stg.pdf
index 9e90977..8df46a5 100644
Binary files a/graphics/dist-cor-stg.pdf and b/rslt/dist_cor/dist-cor-stg.pdf differ
diff --git a/rslt/dist_cor/dist-cor-wk.pdf b/rslt/dist_cor/dist-cor-wk.pdf
new file mode 100644
index 0000000..e098a93
Binary files /dev/null and b/rslt/dist_cor/dist-cor-wk.pdf differ
diff --git a/text/evaluation/thething.tex b/text/evaluation/thething.tex
index b68bad5..20aec7e 100644
--- a/text/evaluation/thething.tex
+++ b/text/evaluation/thething.tex
@@ -51,7 +51,8 @@ In general, we can claim that the Adaptive is the most reliable and best perform
 Moreover, designing a data-dependent sampling scheme would possibly result in better results for Adaptive.
 
-\paragraph{Temporal distance and correlation}
+\subsubsection{Temporal distance and correlation}
+
 Figure~\ref{fig:avg-dist} shows a comparison of the average temporal distance of the events from the previous/next {\thething} or the start/end of the time series for various distributions in synthetic data.
 More particularly, we count for every event the total number of events between itself and the nearest {\thething} or the series edge.
 We observe that the uniform and bimodal distributions tend to limit the regular event--{\thething} distance.
@@ -61,33 +62,33 @@ On the contrary, distributing the {\thethings} at one part of the sequence, as i
 
 \begin{figure}[htp]
   \centering
-  \includegraphics[width=.5\linewidth]{avg-dist}%
+  \includegraphics[width=.5\linewidth]{evaluation/avg-dist}%
   \caption{Average temporal distance of the events from the {\thethings} for different {\thethings} percentages within a time series in various {\thethings} distributions.}
   \label{fig:avg-dist}
 \end{figure}
 
-Figure~\ref{fig:dist-cor} illustrates a comparison among the aforementioned distributions regarding the overall privacy loss under moderate (Figure~\ref{fig:dist-cor-mod}), and strong (Figure~\ref{fig:dist-cor-stg}) correlation degrees.
+Figure~\ref{fig:dist-cor} illustrates a comparison among the aforementioned distributions regarding the overall privacy loss under (a)~weak, (b)~moderate, and (c)~strong temporal correlation degrees.
 The line shows the overall privacy loss---for all cases of {\thethings} distribution---without temporal correlation.
-We skip the presentation of the results under a weak correlation degree, since they converge in this case.
-In combination with Figure~\ref{fig:avg-dist}, we conclude that a greater average event-{\thething} distance in a distribution can result into greater overall privacy loss under moderate and strong temporal correlation.
-This is due to the fact that the backward/forward privacy loss accumulates more over time in wider spaces without {\thethings} (see Section~\ref{subsec:correlations}).
+In combination with Figure~\ref{fig:avg-dist}, we conclude that a greater average event--{\thething} distance in a distribution can result in greater overall privacy loss under moderate and strong temporal correlation.
+This is due to the fact that the backward/forward privacy loss accumulates more over time in wider spaces without {\thethings} (see Section~\ref{sec:correlation}).
 Furthermore, the behavior of the privacy loss is as expected regarding the temporal correlation degree.
 Predictably, a stronger correlation degree generates higher privacy loss while widening the gap between the different distribution cases.
 On the contrary, a weaker correlation degree makes it harder to differentiate among the {\thethings} distributions.
+Indeed, under a weak correlation degree, the privacy loss converges for all distributions.
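+For intuition, the loss that we measure accumulates essentially as $\alpha_t = \varepsilon_t + \ln\max_{q, d}\frac{q\,(e^{\alpha_{t \pm 1}} - 1) + 1}{d\,(e^{\alpha_{t \pm 1}} - 1) + 1}$ (a sketch of the recurrence that our implementation evaluates, where $q$ and $d$ denote sums over elements of two different rows of the backward/forward transition matrix), and therefore grows with every timestamp between two consecutive {\thethings}.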
 
 \begin{figure}[htp]
   \centering
   \subcaptionbox{Weak correlation\label{fig:dist-cor-wk}}{%
-    \includegraphics[width=.5\linewidth]{dist-cor-wk}%
+    \includegraphics[width=.5\linewidth]{evaluation/dist-cor-wk}%
   }%
   \hspace{\fill}
   \subcaptionbox{Moderate correlation\label{fig:dist-cor-mod}}{%
-    \includegraphics[width=.5\linewidth]{dist-cor-mod}%
+    \includegraphics[width=.5\linewidth]{evaluation/dist-cor-mod}%
   }%
   \subcaptionbox{Strong correlation\label{fig:dist-cor-stg}}{%
-    \includegraphics[width=.5\linewidth]{dist-cor-stg}%
+    \includegraphics[width=.5\linewidth]{evaluation/dist-cor-stg}%
   }%
-  \caption{Privacy loss for different {\thethings} percentages and distributions, under weak, moderate, and strong degrees of temporal correlation.
+  \caption{Privacy loss for different {\thethings} percentages and distributions, under (a)~weak, (b)~moderate, and (c)~strong degrees of temporal correlation.
   The line shows the overall privacy loss without temporal correlation.}
   \label{fig:dist-cor}
 \end{figure}