#!/usr/bin/env python3 import argparse import collections import csv import datetime import math from matplotlib import pyplot as plt import numpy as np import os import random from scipy.sparse.linalg import eigs from scipy.stats import laplace from scipy.stats import norm import sympy as sp import time import xxhash # Debugging DEBUG = False # Variable to check. # When enabled print something. # Memoization MEM = {} # The cache. MISS = 0 # Number of additions to the cache. TOTAL = 0 # Number of cache accesses. def load_data(path): ''' Read data from a file. Parameters: path - The relative path to the data file. Returns: data - A list of tuples [uid, timestamp, lng, lat, loc]. ''' print('Loading data from', os.path.abspath(path), '... ', end='') data = [] try: with open(path, 'r') as f: reader = csv.reader(f, delimiter=',') data = list(reader) print('OK.') return data except IOError: print('Oops!') exit() def save_output(path, t, e, a_b, a_f, a): ''' Save output to a file. Parameters: path - The relative path to the output file. t - The number of timestamps. e - The privacy budget at each timestamp. a_b - The backward privacy loss at each timestamp. a_f - The forward privacy loss at each timestamp. a - The temporal privacy loss at each timestamp. Returns: Nothing. ''' # timestamp = time.strftime('%Y%m%d%H%M%S') print('Saving output to %s... ' %(path), end='', flush=True) os.makedirs(os.path.dirname(path), exist_ok=True) with open(path, 'w') as f: w = csv.writer(f, delimiter=';') w.writerow(['t', 'e', 'bpl', 'fpl', 'tpl']) w.writerows(zip(list(range(1, t + 1)), e, a_b, a_f, a)) print('OK.', flush=True) def get_timestamps(data): ''' Get all the timestamps from the input data. Parameters: data - The input data set. Returns: timestamps - An ndarray of all of the timestamps from the input data. ''' print('Getting a list of all timestamps... ', end='', flush=True) timestamps = np.sort(np.unique(np.array(data)[:, 1])) if not len(timestamps): print('No timestamps found.', flush=True) exit() # Check timestamps' order. prev = None for t in timestamps: curr = time.mktime(datetime.datetime.strptime(t, '%Y-%m-%d %H:%M').timetuple()) if prev: if curr < prev: print('Wrong order.', flush=True) exit() prev = curr print('OK.', flush=True) if DEBUG: print(timestamps, len(timestamps)) return timestamps def get_locs(data): ''' Get all the unique locations from the input data. Parameters: data - The input data set. Returns: locs - A sorted ndarray of all the unique locations int the input data. ''' print('Getting a list of all locations... ', end='', flush=True) locs = np.sort(np.unique(np.array(data)[:, 4].astype(np.int))) if not len(locs): print('No locations found.', flush=True) exit() print('OK.', flush=True) if DEBUG: print(list(map(str, locs)), len(locs)) return list(map(str, locs)) def get_cnts(data, t): ''' Get the counts at every location for a specific timestamp. Parameters: data - The input data set. t - The timestamp of interest. Returns: cnts - A dict {loc:cnt} with the counts at every location for a specific timestamp. ''' print('Getting all counts at %s... ' %(t), end='', flush=True) locs = get_locs(data) cnts = dict.fromkeys(locs, 0) for d in data: if d[1] == t: cnts[int(d[4])] += 1 print('OK.', flush=True) if DEBUG: print(cnts) return cnts def get_all_cnts(data): ''' Get the counts at every location for every timestamp. Parameters: data - The input data set. Returns: cnts - A dict {timestamp:loc} with all the counts at every location for every timestamp. ''' cnts = {} for d in data: key = d[1] + '@' + d[4] cnts[key] = cnts.get(key, 0) + 1 if DEBUG: print(cnts) return cnts def get_usrs(data): ''' Get a list of unique users in the input data set. Parameters: data - The input data set. Returns: users - An ndarray of all unique users. ''' users = np.sort(np.unique(np.array(data)[:, 0].astype(np.int))) if not len(users): print('No users found.') exit() if DEBUG: print(users, len(users)) return users def get_usr_data(data, id): ''' Get the data of a particular user from a data set. Parameters: data - The input data set. id - The user identifier. Returns: output - A list of the data of the targeted user. ''' output = [] for d in data: if (d[0] == str(id)): output.append(d) if DEBUG: print(id, output) return output def get_usrs_data(data): ''' Get the data of every user in a data set. Parameters: data - The input data set. Returns: output - A dict {usr, [usr_data]} with the data of each user. ''' output = {} for d in data: output[d[0]] = output.get(d[0], []) + [d] return output def get_usr_traj(data): ''' Get the trajectory of a user from her data. Parameters: data - The data of the user. Returns: traj - A list [(timestamp, loc)] with the locations and corresponding timestamps that the user was at. ''' traj = [] for d in data: traj.append((d[1], d[4])) if DEBUG: print(traj) return traj def get_poss_trans(data): ''' Get all the possible transitions. Parameters: data - The input data set. Returns: trans - A set with all the possible forward transitions in the input. ''' print('Getting possible transitions... ', end='', flush=True) trans = set() for u, u_data in data.items(): traj = get_usr_traj(u_data) trans.update(list(zip(traj, traj[1:]))) if len(trans) == 0: print('None.', flush=True) exit() print('OK.', flush=True) return trans def get_bwd_trans(data): ''' Get all backward transitions in a data set. Parameters: data - The input data set. Returns: trans - A dict {(t, t-1):[transitions]} with all the backward transitions at every sequential timestamp pair in the input data set. ''' print('Getting all backward transitions... ', end='', flush=True) trans = {} for u, u_data in data.items(): traj = get_usr_traj(u_data) for i, j in zip(traj[1:], traj): trans[(i[0], j[0])] = trans.get((i[0], j[0]), []) + [(i[1], j[1])] if len(trans) == 0: print('None.', flush=True) exit() print('OK.', flush=True) return trans def get_fwd_trans(data): ''' Get all forward transitions in a data set. Parameters: data - The input data set. Returns: trans - A dict {(t-1, t):[transitions]} with all the forward transitions at every sequential timestamp pair in the input data set. ''' print('Getting all forward transitions... ', end='', flush=True) trans = {} for u, u_data in data.items(): traj = get_usr_traj(u_data) for i, j in zip(traj, traj[1:]): trans[(i[0], j[0])] = trans.get((i[0], j[0]), []) + [(i[1], j[1])] if len(trans) == 0: print('None.', flush=True) exit() print('OK.', flush=True) return trans def safe_div(a, b): ''' Divide two numbers. If the divisor is 0 return inf. Parameters: a - The dividend. b - The divisor. Returns: The float result of the division. ''' if b == 0: return math.inf return float(a/b) def max_val(q, d, a): ''' Calculate the maximum value of the objective function. Parameters: q - A row from the transition matrix. d - Another row from the transition matrix. a - The backward/forward privacy loss of the previous/next timestamp. Returns: The maximum value of the objective function. ''' if a == math.inf: return math.nan return (q*(math.exp(a) - 1) + 1)/(d*(math.exp(a) - 1) + 1) def find_qd(p, a): ''' Find two different rows (q and d) of a transition matrix (p) that maximize the product of the objective function and return their sums. Parameters: p - The transition matrix representing the backward/forward correlations. a - The backward/forward privacy loss of the previous/next timestamp. Returns: sum_q - The sum of the elements of q. sum_d - The sum of the elements of d. ''' res = 0.0 sum_q, sum_d = 0.0, 0.0 for q in p: # A row from the transition matrix. for d in p: # Another row from the transition matrix. if not np.array_equal(q, d) and sum(q) and sum(d): plus = [] # Indexes of the elements of q+ and d+. for i in range(np.shape(p)[1]): # Iterate each column. if q[i] > d[i]: plus.append(i) s_q, s_d = 0.0, 0.0 update = True while update: update = False for i in plus: s_q += q[i] s_d += d[i] max_tmp = max_val(s_q, s_d, a) for i in plus: if safe_div(q[i], d[i]) <= max_tmp: plus.remove(i) update = True res_tmp = math.log(max_val(s_q, s_d, a)) if res < res_tmp: res = res_tmp sum_q, sum_d = s_q, s_d return sum_q, sum_d def gen_data(usrs, timestamps, locs): ''' Generate data. Parameters: usrs - The number of users. timestamps - The number of timestamps. locs - The numner of locations. Returns: data - The generated data. ''' print('Generating data... ', end='', flush=True) # Generate timestamps. ts = [] t_start = datetime.datetime.now() for t in range(timestamps): t_start += datetime.timedelta(minutes=t) ts.append(str(t_start.strftime('%Y-%m-%d %H:%M'))) # Randomly generate unique possible transitions. trans = set() for i in range(locs**2): trans.add((str(random.randint(1, locs)), str(random.randint(1, locs)))) # Generate the data. data = [] for u in range(1, usrs + 1): # Choose a random starting location. l_start = random.sample(trans, 1)[0][0] for t in ts: # Choose a random next location. while True: l_nxt = str(random.randint(1, locs)) if (l_start, l_nxt) in trans: data.append([str(u), str(t), str(l_nxt), str(l_nxt), str(l_nxt)]) break print('OK.', flush=True) return data def gen_trans_mt(n, s): ''' Generate a transition matrix. Parameters: n - The dimension of the matrix. s - The correlation degree of each row [0, 1]. The lower its value, the lower the degree of uniformity of each row. Returns: p_ - The transition matrix. ''' if DEBUG: print('Generating transition matrix %dx%d with s = %.4f... ' %(n, n, s), end='', flush=True) p = np.zeros((n, n), float) np.fill_diagonal(p, 1.0) p_ = np.zeros((n, n), float) for j, p_j in enumerate(p[:, ]): for k, p_jk in enumerate(p_j): p_[j, k] = round((p[j, k] + s) / (sum(p_j) + len(p_j)*s), 4) if DEBUG: print('OK.', flush=True) if DEBUG: print(p_) return p_ def get_trans_mt(locs, trans): ''' Get the transition matrix Parameters: locs - A list of all the locations. trans - A list of all transitions. Returns: p - A 2d dict {{locs}{locs}} containing the corresponding location transition probabilities. ''' if DEBUG: print('Generating the transition matrix... ', end='', flush=True) # Initialize the transition matrix. p = collections.defaultdict(dict) for i in locs: for j in locs: p[i][j] = 0.0 # Populate the transition matrix. for i in locs: for j in locs: # Count the transitions from i to j. cnt = collections.Counter(trans)[(i, j)] # Count the transitions from i to any location. sum_cnts = len([tr for tr in trans if tr[0] == i]) res = 0.0 if cnt != 0 and sum_cnts != 0: res = cnt/sum_cnts # Update transition matrix. p[i][j] = res if DEBUG: print('OK.', flush=True) for i in locs: print(p[i]) return p def get_entropy(mt): ''' Calculate the measure-theoretic (Kolmogorov-Sinai) entropy of a transition matrix. Parameters: mt - A 2d dict transition matrix. Returns: h - The Kolmogorov-Sinai entropy of the matrix. ''' if DEBUG: print('Calculating the measure-theoretic entropy... ', end='', flush=True) h = 0.0 # Get the 2d array of the transition matrix. a = get_2darray(mt) # Check if the transition matrix is empty. if np.allclose(a, np.zeros((len(a), len(a)), float)): return h # Find the stationary distribution (v), written as a row # vector, that does not change under the application of # the transition matrix. In other words, the left eigenvector # associated with eigenvalue (w) 1. It is also called # Perron-Frobenius eigenvector, since it is a stochastic # matrix and the largest magnitude of the eigenvalues must # be 1. We use the transpose of the transition matrix to get # the left eigenvector. w, v = eigs(a.T, k=1, which='LM') # Normalize the eigenvector. The vector is a probability # distribution, and therefore its sum must be 1. Thus, we use # its absolute elements' values and its l1-norm to normalize it. p = np.absolute(v[:,0].real) p_norm = np.linalg.norm(p, ord=1) if p_norm: p = p/p_norm # Calculate the entropy. if np.isclose(w.real[0], 1, atol=1e-03): for i in range(len(a)): for j in range(len(a)): # When the transition probability is 0 use the limiting value (0). if not np.isclose(a[i][j], 0, atol=1e-03) and not np.isclose(p[i], 0, atol=1e-03): # Binary logarithm measures entropy in bits. h += -p[i]*a[i][j]*math.log2(a[i][j]) if DEBUG: print(h, flush=True) return h def get_2darray(mt): ''' Convert a 2d dict to a 2d array. Parameters: mt - The 2d dict. Returns: p - The 2d numpy array. ''' if type(mt) == type(np.array([])): return mt p = np.zeros((len(mt), len(mt)), float) for i, mt_i in enumerate(mt.items()): p[i] = list(mt_i[1].values()) return p def get_laplace_pd(ts, t, sc): ''' Get a Laplace probability distribution. Parameters: ts - The points of the distribution. t - The location of the distribution. sc - The scale of the distribution. Returns: The probability distribution. ''' x = np.arange(0, len(ts), 1) loc = np.where(ts == t) return laplace.pdf(x, loc=loc, scale=sc)[0] def get_norm_pd(ts, t, sc): ''' Get a Gaussian probability distribution. Parameters: ts - The points of the distribution. t - The location of the distribution. sc - The scale of the distribution. Returns: The probability distribution. ''' x = np.arange(0, len(ts), 1) loc = np.where(ts == t) return norm.pdf(x, loc=loc, scale=sc)[0] def get_sample(ts, t, pct, pd): ''' Get a sample from the time series. Parameters: ts - An ndarray of the timestamps. t - The current timestamp. pd - The probability distribution. ptn - The desired portion [0, 1] of the non-zero elements of the probability distribution to be sampled. Returns: spl - An ndarray of the sampled timestamps. ''' if DEBUG: print('Sampling %.2f%% of %s at %s... ' %(pct*100, ts, t), end='', flush=True) # Check that it is a valid timestamp. if not t in ts: print('%s is not in the time series.' %(t)) exit() # Remove the current timestamp from the possible sample. i = np.where(ts == t) ts = np.delete(ts, i) pd = np.delete(pd, i) # Make sure that the probabilities sum to 1. pd_norm = pd/np.linalg.norm(pd, ord=1) # Get the sorted sample. spl = np.sort(np.random.choice(ts, size=int(np.count_nonzero(pd)*pct), replace=False, p=pd_norm)) if DEBUG: print(spl, flush=True) return spl def priv_l(p, a, e): ''' Calculate the backward/forward privacy loss at the current timestamp. Parameters: p - The transition matrix representing the backward/forward temporal correlations. a - The privacy loss of the previous/next timestamp. e - The privacy budget for data publishing. Returns: The backward/forward privacy loss at the current timestamp. ''' sum_q, sum_d = find_qd(p, a) return math.log(max_val(sum_q, sum_d, a)) + e def priv_l_m(p, a, e): ''' Calculate the backward/forward privacy loss at the current timestamp using memoization. Parameters: p - The transition matrix representing the backward/forward temporal correlations. a - The privacy loss of the previous/next timestamp. e - The privacy budget for data publishing. Returns: The backward/forward privacy loss at the current timestamp. ''' key = xxhash.xxh64(p).hexdigest() + str(a) + str(e) global MEM, TOTAL, MISS TOTAL += 1 result = MEM.get(key) if not result: MISS += 1 sum_q, sum_d = find_qd(p, a) result = math.log(max_val(sum_q, sum_d, a)) + e MEM[key] = result return result def bpl(p, a, e, t): ''' Calculate the total backward privacy loss at every timestamp. Parameters: p - The transition matrix representing the backward temporal correlations. a - The backward privacy loss of every release. e - The privacy budget for data publishing. t - The time limit. Returns: a - The backward privacy loss at every timestamp due to the previous data releases. ''' a[0] = e[0] for i in range(1, t): a[i] = priv_l(p, a[i - 1], e[i]) return a def bpl_m(p, a, e, t): ''' Calculate the total backward privacy loss at the current timestamp with memoization. Parameters: p - The transition matrix representing the backward temporal correlations. a - The backward privacy loss of the current release at all previous timestamps. e - The privacy budget for data publishing. t - The time limit. Returns: a - The backward privacy loss at the current timestamp due to the previous data releases. ''' a[0] = e[0] for i in range(1, t): a[i] = priv_l_m(p, a[i - 1], e[i]) return a def bpl_lmdk_mem(p, a, e, t, lmdk): # t is (near) the landmark if lmdk == t - 1 or t == lmdk: return np.array([0]*len(a)) # There is no landmark before if lmdk < 1: lmdk = 1 a[0] = e[0] for i in range(lmdk, t): a[i] = priv_l_m(p, a[i - 1], e[i]) return a def bpl_s(p, e, i, w): ''' Calculate the total backward privacy loss at the current timestamp using the static model, i.e., previous releases are grouped in a window of static size. Parameters: p - The transition matrix representing the backward temporal correlations. e - The privacy budget for data publishing. i - The timestamp of interest. w - The window size to group previous releases. Returns: a - The backward privacy loss at the current timestamp due to the previous data releases. ''' if i - w > 1: # print('bpl_s: %d - %d [%d]' %(i, i - w, w)) return priv_l(np.linalg.matrix_power(p, w), bpl_s(p, e, i - w, w), e[i - 1]) elif i - w <= 1: # print('bpl_s: %d - %d [%d]' %(i, 1, i - 1)) return priv_l(np.linalg.matrix_power(p, i - 1), e[0], e[i - 1]) elif i == 1: # print('bpl_s: %d' %(i)) return e[0] def bpl_s_m(p, e, i, w): ''' Calculate the total backward privacy loss at the current timestamp using the static model, i.e., previous releases are grouped in a window of static size, using memoization. Parameters: p - The transition matrix representing the backward temporal correlations. e - The privacy budget for data publishing. i - The timestamp of interest. w - The window size to group previous releases. Returns: a - The backward privacy loss at the current timestamp due to the previous data releases. ''' if i - w > 1: return priv_l_m(np.linalg.matrix_power(p, w), bpl_s_m(p, e, i - w, w), e[i - 1]) elif i - w <= 1: return priv_l_m(np.linalg.matrix_power(p, i - 1), e[0], e[i - 1]) elif i == 1: return e[0] def bpl_l(p, e, i, w, l): ''' Calculate the total backward privacy loss at the current timestamp using the linear model, i.e., previous releases are grouped in a window of a size that increases linearly. Parameters: p - The transition matrix representing the backward temporal correlations. e - The privacy budget for data publishing. i - The timestamp of interest. w - The window size to group previous releases. l - The linearly increasing coefficient that affects the window size. Returns: a - The backward privacy loss at the current timestamp due to the previous data releases. ''' if i - w*l > 1: # print('bpl_l: %d - %d [%d]' %(i, i - w*l, w*l)) return priv_l(np.linalg.matrix_power(p, w*l), bpl_l(p, e, i - w*l, w, l + 1), e[i - 1]) elif i - w*l <= 1: # print('bpl_l: %d - %d [%d]' %(i, 1, i - 1)) return priv_l(np.linalg.matrix_power(p, i - 1), e[0], e[i - 1]) elif i == 1: # print('bpl_l: %d' %(1)) return e[0] def bpl_l_m(p, e, i, w, l): ''' Calculate the total backward privacy loss at the current timestamp using the linear model, i.e., previous releases are grouped in a window of a size that increases linearly, using memoization. Parameters: p - The transition matrix representing the backward temporal correlations. e - The privacy budget for data publishing. i - The timestamp of interest. w - The window size to group previous releases. l - The linearly increasing coefficient that affects the window size. Returns: a - The backward privacy loss at the current timestamp due to the previous data releases. ''' if i - w*l > 1: return priv_l_m(np.linalg.matrix_power(p, w*l), bpl_l_m(p, e, i - w*l, w, l + 1), e[i - 1]) elif i - w*l <= 1: return priv_l_m(np.linalg.matrix_power(p, i - 1), e[0], e[i - 1]) elif i == 1: return e[0] def bpl_e(p, e, i, w, h): ''' Calculate the total backward privacy loss at the current timestamp using the exponential model, i.e., previous releases are grouped in a window of a size that increases exponentially. Parameters: p - The transition matrix representing the backward temporal correlations. e - The privacy budget for data publishing. i - The timestamp of interest. w - The window size to group previous releases. h - The exponentially increasing coefficient that affects the window size. Returns: a - The backward privacy loss at the current timestamp due to the previous data releases. ''' if i - w**h > 1: # print('bpl_e: %d - %d [%d]' %(i, i - w**h, w**h)) return priv_l(np.linalg.matrix_power(p, w**h), bpl_e(p, e, i - w**h, w, h + 1), e[i - 1]) elif i - w**h <= 1: # print('bpl_e: %d - %d [%d]' %(i, 1, i - 1)) return priv_l(np.linalg.matrix_power(p, i - 1), e[0], e[i - 1]) elif i == 1: # print('bpl_e: %d' %(1)) return e[0] def bpl_e_m(p, e, i, w, h): ''' Calculate the total backward privacy loss at the current timestamp using the exponential model, i.e., previous releases are grouped in a window of a size that increases exponentially, using memoization. Parameters: p - The transition matrix representing the backward temporal correlations. e - The privacy budget for data publishing. i - The timestamp of interest. w - The window size to group previous releases. h - The exponentially increasing coefficient that affects the window size. Returns: a - The backward privacy loss at the current timestamp due to the previous data releases. ''' if i - w**h > 1: return priv_l_m(np.linalg.matrix_power(p, w**h), bpl_e_m(p, e, i - w**h, w, h + 1), e[i - 1]) elif i - w**h <= 1: return priv_l_m(np.linalg.matrix_power(p, i - 1), e[0], e[i - 1]) elif i == 1: return e[0] def fpl(p, a, e, t): ''' Calculate the total forward privacy loss at the current timestamp. Parameters: p - The transition matrix representing the forward temporal correlations. a - The forward privacy loss of the current release at all next timestamps. e - The privacy budget for data publishing. t - The time limit. Returns: a - The forward privacy loss at the current timestamp due to the next data releases. ''' a[t - 1] = e[t - 1] for i in range(t - 2, -1, -1): a[i] = priv_l(p, a[i + 1], e[i]) return a def fpl_m(p, a, e, t): ''' Calculate the total forward privacy loss at the current timestamp, using memoization. Parameters: p - The transition matrix representing the forward temporal correlations. a - The forward privacy loss of the current release at all next timestamps. e - The privacy budget for data publishing. t - The time limit. Returns: a - The forward privacy loss at the current timestamp due to the next data releases. ''' a[t - 1] = e[t - 1] for i in range(t - 2, -1, -1): a[i] = priv_l_m(p, a[i + 1], e[i]) return a def fpl_lmdk_mem(p, a, e, t, lmdk): # t is (near) the landmark if lmdk == t + 1 or t == lmdk: return np.array([0]*len(a)) # There is no landmark after if lmdk > len(e): lmdk = len(e) a[lmdk - 1] = e[lmdk - 1] for i in range(lmdk - 2, t - 2, -1): a[i] = priv_l_m(p, a[i + 1], e[i]) return a def fpl_s(p, e, i, t, w): ''' Calculate the total forward privacy loss at the current timestamp using the static model, i.e., next releases are grouped in a window of static size. Parameters: p - The transition matrix representing the forward temporal correlations. e - The privacy budget for data publishing. i - The timestamp of interest. w - The window size to group next releases. Returns: a - The forward privacy loss at the current timestamp due to the next data releases. ''' if i + w < t: # print('fpl_s: %d - %d [%d]' %(i, i + w, w)) return priv_l(np.linalg.matrix_power(p, w), fpl_s(p, e, i + w, t, w), e[i - 1]) elif i + w >= t: # print('fpl_s: %d - %d [%d]' %(i, t, t - i)) return priv_l(np.linalg.matrix_power(p, t - i), e[t - 1], e[i - 1]) elif i == t: # print('fpl_s: %d' %(t)) return e[t - 1] def fpl_s_m(p, e, i, t, w): ''' Calculate the total forward privacy loss at the current timestamp using the static model, i.e., next releases are grouped in a window of static size, using memoization. Parameters: p - The transition matrix representing the forward temporal correlations. e - The privacy budget for data publishing. i - The timestamp of interest. w - The window size to group next releases. Returns: a - The forward privacy loss at the current timestamp due to the next data releases. ''' if i + w < t: return priv_l_m(np.linalg.matrix_power(p, w), fpl_s_m(p, e, i + w, t, w), e[i - 1]) elif i + w >= t: return priv_l_m(np.linalg.matrix_power(p, t - i), e[t - 1], e[i - 1]) elif i == t: return e[t - 1] def fpl_l(p, e, i, t, w, l): ''' Calculate the total forward privacy loss at the current timestamp using the linear model, i.e., next releases are grouped in a window of a size that increases linearly. Parameters: p - The transition matrix representing the forward temporal correlations. e - The privacy budget for data publishing. i - The timestamp of interest. w - The window size to group next releases. l - The linearly increasing coefficient that affects the window size. Returns: a - The forward privacy loss at the current timestamp due to the next data releases. ''' if i + w*l < t: # print('fpl_l: %d - %d [%d]' %(i, i + w*l, w*l)) return priv_l(np.linalg.matrix_power(p, w*l), fpl_l(p, e, i + w*l, t, w, l + 1), e[i - 1]) elif i + w*l >= t: # print('fpl_l: %d - %d [%d]' %(i, t, t - i)) return priv_l(np.linalg.matrix_power(p, t - i), e[t - 1], e[i - 1]) elif i == t: # print('fpl_l: %d' %(t)) return e[t - 1] def fpl_l_m(p, e, i, t, w, l): ''' Calculate the total forward privacy loss at the current timestamp using the linear model, i.e., next releases are grouped in a window of a size that increases linearly, using memoization. Parameters: p - The transition matrix representing the forward temporal correlations. e - The privacy budget for data publishing. i - The timestamp of interest. w - The window size to group next releases. l - The linearly increasing coefficient that affects the window size. Returns: a - The forward privacy loss at the current timestamp due to the next data releases. ''' if i + w*l < t: return priv_l_m(np.linalg.matrix_power(p, w*l), fpl_l_m(p, e, i + w*l, t, w, l + 1), e[i - 1]) elif i + w*l >= t: return priv_l_m(np.linalg.matrix_power(p, t - i), e[t - 1], e[i - 1]) elif i == t: return e[t - 1] def fpl_e(p, e, i, t, w, h): ''' Calculate the total forward privacy loss at the current timestamp using the exponential model, i.e., next releases are grouped in a window of a size that increases exponentially. Parameters: p - The transition matrix representing the forward temporal correlations. e - The privacy budget for data publishing. i - The timestamp of interest. w - The window size to group next releases. h - The exponentially increasing coefficient that affects the window size. Returns: a - The forward privacy loss at the current timestamp due to the next data releases. ''' if i + w**h < t: # print('fpl_e: %d - %d [%d]' %(i, i + w**h, w**h)) return priv_l(np.linalg.matrix_power(p, w**h), fpl_e(p, e, i + w**h, t, w, h + 1), e[i - 1]) elif i + w**h >= t: # print('fpl_e: %d - %d [%d]' %(i, t, t - i)) return priv_l(np.linalg.matrix_power(p, t - i), e[t - 1], e[i - 1]) elif i == t: # print('fpl_e: %d' %(t)) return e[t - 1] def fpl_e_m(p, e, i, t, w, h): ''' Calculate the total forward privacy loss at the current timestamp using the exponential model, i.e., next releases are grouped in a window of a size that increases exponentially, using memoization. Parameters: p - The transition matrix representing the forward temporal correlations. e - The privacy budget for data publishing. i - The timestamp of interest. w - The window size to group next releases. h - The exponentially increasing coefficient that affects the window size. Returns: a - The forward privacy loss at the current timestamp due to the next data releases. ''' if i + w**h < t: return priv_l_m(np.linalg.matrix_power(p, w**h), fpl_e_m(p, e, i + w**h, t, w, h + 1), e[i - 1]) elif i + w**h >= t: return priv_l_m(np.linalg.matrix_power(p, t - i), e[t - 1], e[i - 1]) elif i == t: return e[t - 1] def tpl(bpl, fpl, e): ''' Calculate the total privacy loss at every timestamp. Parameters: bpl - The backward privacy loss. fpl - The forward privacy loss. e - The privacy budget for data publishing. Returns: The list of total privacy loss at every timestamp. ''' return [x + y - z for (x, y, z) in zip(bpl, fpl, e)] def tpl_lmdk_mem(e, p_b, p_f, seq, lmdks): ''' Calculate the temporal privacy loss at every timestamp taking into account landmarks. Parameters: e - The privacy budget for data publishing. p_b - The transition matrix representing the backward temporal correlations. p_f - The transition matrix representing the forward temporal correlations. seq - The point sequence. lmdks - The landmarks. Returns: a_b - The backward privacy loss at the current timestamp due to the previous data releases. a_f - The forward privacy loss at the current timestamp due to the next data releases. a - The total privacy loss at every timestamp taking into account landmarks. ''' a_b = np.zeros(len(seq)) a_f = np.zeros(len(seq)) a = np.zeros(len(seq)) for t in seq: t_prv, t_nxt = get_limits(t, seq, lmdks) a_b[t - 1] = bpl_lmdk_mem(p_b, np.zeros(max(seq)), e, t, t_prv)[t - 1] a_f[t - 1] = fpl_lmdk_mem(p_f, np.zeros(max(seq)), e, t, t_nxt)[t - 1] if a_b[t - 1] > 0 and a_f[t - 1] > 0: a[t - 1] = a_b[t - 1] + a_f[t - 1] - e[t - 1] elif a_b[t - 1] > 0 or a_f[t - 1] > 0: a[t - 1] = a_b[t - 1] + a_f[t - 1] else: a[t - 1] = e[t - 1] return a_b, a_f, a def get_limits(t, seq, lmdks): ''' Get the limits for the calculation of temporal privacy loss. Parameters: t - The current timestamp. seq - The point sequence. lmdks - The landmarks. Returns: t_prv - The previous landmark. t_nxt - The next landmark. ''' # Add landmark limits. seq_lmdks = np.copy(lmdks) # if seq[0] not in seq_lmdks: # seq_lmdks = np.append(seq_lmdks, seq[0]) # if seq[len(seq) - 1] not in seq_lmdks: # seq_lmdks = np.append(seq_lmdks, seq[len(seq) - 1]) # seq_lmdks = np.sort(seq_lmdks) # Initialize previous and next landmarks. t_prv = 0 t_nxt = len(seq) + 1 # Add current timestamp to landmarks. seq_lmdks_tmp = np.copy(seq_lmdks) if t not in seq_lmdks_tmp[:]: seq_lmdks_tmp = np.append(seq_lmdks, t) seq_lmdks_tmp = np.sort(seq_lmdks_tmp) # Find the index of the current timestamp in the landmarks. idx, = np.where(seq_lmdks_tmp == t) i = idx[0] # Find previous landmark. if i != 0: t_prv = seq_lmdks_tmp[i - 1] # Find next landmark. if i != len(seq_lmdks_tmp) - 1: t_nxt = seq_lmdks_tmp[i + 1] return t_prv, t_nxt def plot_loss(title, e, a_b, a_f, a): ''' Plots the privacy loss of the time series. Parameters: title - The title of the plot. e - The privacy budget for data publishing. a_b - The backward privacy loss. a_f - The forward privacy loss. a - The total privacy loss. Returns: Nothing. ''' plt.rc('font', family='serif') plt.rc('font', size=10) plt.rc('text', usetex=True) p = np.arange(1.0, len(e) + 1, 1) plt.plot(p, e, label=r'$\varepsilon$', color='gray', marker='s') plt.plot(p, a_b, label=r'$\alpha^B$', color='#e52e4e', # red marker='<') plt.plot(p, a_f, label=r'$\alpha^F$', color='#009874', # green marker='>') plt.plot(p, a, label=r'$\alpha$', color='#6082b6', # blue marker='d') plt.axis([1.0, len(e), 0.0, 1.5*max(a)]) plt.legend(loc='best', frameon=True) plt.grid(axis='y', alpha=1.0) # Add grid on y axis. plt.xlabel('time') # Set x axis label. plt.ylabel('privacy loss') # Set y axis label. plt.title(title) plt.show() def cmp_loss(title, a, a_s, a_e, a_l): ''' Plots a comparison of the privacy loss of all models. Parameters: title - The title of the plot. a - The privacy loss of the basic model. a_s - The privacy loss of the static model. a_e - The privacy loss of the exponential model. a_l - The privacy loss of the linear model. Returns: Nothing. ''' plt.rc('font', family='serif') plt.rc('font', size=10) plt.rc('text', usetex=True) p = np.arange(1.0, len(a) + 1, 1) plt.plot(p, a, label=r'$\alpha$', color='red', marker='s') plt.plot(p, a_s, label=r'$\alpha_s$', color='green', marker='>') plt.plot(p, a_e, label=r'$\alpha_e$', color='blue', marker='d') plt.plot(p, a_l, label=r'$\alpha_l$', color='yellow', marker='<') plt.axis([1.0, len(a), 0.0, 1.5*max(a)]) plt.legend(loc='best', frameon=True) plt.grid(axis='y', alpha=1.0) # Add grid on y axis. plt.xlabel('time') # Set x axis label. plt.ylabel('privacy loss') # Set y axis label. plt.title(title) plt.show() def parse_args(): ''' Parse arguments. Mandatory: model, The model to be used: (b)asic, (s)tatic, (l)inear, (e)xponential. Optional: -c, --correlation, The correlation degree. -d, --debug, Enable debugging mode. -e, --epsilon, The available privacy budget. -m, --matrix, The size of the transition matrix. -o, --output, The path to the output directory. -t, --time, The time limit. -w, --window, The size of the event protection window. ''' # Create argument parser. parser = argparse.ArgumentParser() # Mandatory arguments. parser.add_argument('model', help='The model to be used.', choices=['b', 's', 'l', 'e'], type=str) # Optional arguments. parser.add_argument('-c', '--correlation', help='The correlation degree.', type=float, default=0.1) parser.add_argument('-d', '--debug', help='Enable debugging mode.', action='store_true') parser.add_argument('-e', '--epsilon', help='The available privacy budget.', type=float, default=1.0) parser.add_argument('-m', '--matrix', help='The matrix size.', type=int, default=2) parser.add_argument('-o', '--output', help='The path to the output directory.', type=str, default='') parser.add_argument('-t', '--time', help='The time limit.', type=int, default=1) parser.add_argument('-w', '--window', help='The size of the event protection window.', type=int, default=1) # Parse arguments. args = parser.parse_args() return args def main(args): global DEBUG, MISS, TOTAL DEBUG = args.debug MISS, TOTAL = 0, 0 e = [args.epsilon]*args.time # Generate transition matrices. p_b = gen_trans_mt(args.matrix, args.correlation) p_f = gen_trans_mt(args.matrix, args.correlation) if DEBUG: # Run basic for comparison. a_b = bpl(p_b, [0.0] * args.time, e, args.time) a_f = fpl(p_f, [0.0] * args.time, e, args.time) a = tpl(a_b, a_f, e) output_path = args.output +\ '/' + args.model +\ '_c' + str(args.correlation) +\ '_e' + str(args.epsilon) +\ '_m' + str(args.matrix) +\ '_t' + str(args.time) +\ '_w' + str(args.window) if args.model == 'b': # Basic # start_time = time.process_time() a_b = bpl_m(p_b, [0.0] * args.time, e, args.time) a_f = fpl_m(p_f, [0.0] * args.time, e, args.time) a = tpl(a_b, a_f, e) # duration1 = time.process_time() - start_time if DEBUG: print('\Basic\n' 't\tbpl\t\tfpl\t\ttpl') for i in range(0, args.time): print('%d\t%f\t%f\t%f' %(i + 1, a_b[i], a_f[i], a[i])) print('\nSUM\t%f\t%f\t%f' %(sum(a_b), sum(a_f), sum(a))) print('AVG\t%f\t%f\t%f' %(sum(a_b)/args.time, sum(a_f)/args.time, sum(a)/args.time)) plot_loss('Basic', e, a_b, a_f, a) if args.Returns: save_output(output_path, args.time, e, a_b, a_f, a) # start_time = time.process_time() # a_b_m = bpl_m(p_b, [0.0] * args.time, e, args.time) # a_f_m = fpl_m(p_f, [0.0] * args.time, e, args.time) # a_m = tpl(a_b_m, a_f_m, e) # duration2 = time.process_time() - start_time # diff = ((duration2 - duration1)/duration1)*100 # success = safe_div((TOTAL - MISS)*100, TOTAL) # print('##############################\n' # 'basic : %.4fs\n' # 'w/ mem: %.4fs\n' # '- succ: %d/%d (%.2f%%)\n' # 'diff : %.4fs (%.2f%%)\n' # 'OK : %r' # %(duration1, # duration2, # TOTAL - MISS, TOTAL, success, # duration2 - duration1, diff, # np.array_equal(a, a_m))) # MISS, TOTAL = 0, 0 elif args.model == 's': # Static # start_time = time.process_time() a_s_b, a_s_f = [None]*args.time, [None]*args.time for i in range(1, args.time + 1): a_s_b[i - 1] = bpl_s_m(p_b, e, i, args.window) a_s_f[i - 1] = fpl_s_m(p_f, e, i, args.time, args.window) a_s = tpl(a_s_b, a_s_f, e) # duration1 = time.process_time() - start_time if DEBUG: print('\nStatic\n' 't\tbpl\t\t\tfpl\t\t\ttpl') for i in range(0, args.time): print('%d\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)' %(i + 1, a_s_b[i], (a_s_b[i] - a_b[i])/a_b[i]*100, a_s_f[i], (a_s_f[i] - a_f[i])/a_f[i]*100, a_s[i], (a_s[i] - a[i])/a[i]*100)) print('\nSUM\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)' %(sum(a_s_b), ((sum(a_s_b) - sum(a_b))/sum(a_b))*100, sum(a_s_f), ((sum(a_s_f) - sum(a_f))/sum(a_f))*100, sum(a_s), ((sum(a_s) - sum(a))/sum(a))*100)) print('AVG\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)' %(sum(a_s_b)/args.time, ((sum(a_s_b)/args.time - sum(a_b)/args.time)/(sum(a_b)/args.time))*100, sum(a_s_f)/args.time, ((sum(a_s_f)/args.time - sum(a_f)/args.time)/(sum(a_f)/args.time))*100, sum(a_s)/args.time, ((sum(a_s)/args.time - sum(a)/args.time)/(sum(a)/args.time))*100)) plot_loss('Static', e, a_s_b, a_s_f, a_s) if args.Returns: save_output(output_path, args.time, e, a_s_b, a_s_f, a_s) # start_time = time.process_time() # a_s_b_m, a_s_f_m = [None]*args.time, [None]*args.time # for i in range(1, args.time + 1): # a_s_b_m[i - 1] = bpl_s_m(p_b, e, i, args.window) # a_s_f_m[i - 1] = fpl_s_m(p_f, e, i, args.time, args.window) # a_s_m = tpl(a_s_b_m, a_s_f_m, e) # duration2 = time.process_time() - start_time # diff = ((duration2 - duration1)/duration1)*100 # success = safe_div((TOTAL - MISS)*100, TOTAL) # print('##############################\n' # 'static: %.4fs\n' # 'w/ mem: %.4fs\n' # '- succ: %d/%d (%.2f%%)\n' # 'diff : %.4fs (%.2f%%)\n' # 'OK : %r' # %(duration1, # duration2, # TOTAL - MISS, TOTAL, success, # duration2 - duration1, diff, # np.array_equal(a_s, a_s_m))) # MISS, TOTAL = 0, 0 elif args.model == 'l': # Linear l = 1 # start_time = time.process_time() a_l_b, a_l_f = [None]*args.time, [None]*args.time for i in range(1, args.time + 1): a_l_b[i - 1] = bpl_l_m(p_b, e, i, args.window, l) a_l_f[i - 1] = fpl_l_m(p_f, e, i, args.time, args.window, l) a_l = tpl(a_l_b, a_l_f, e) # duration1 = time.process_time() - start_time if DEBUG: print('\nLinear\n' 't\tbpl\t\t\tfpl\t\t\ttpl') for i in range(0, args.time): print('%d\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)' %(i + 1, a_l_b[i], (a_l_b[i] - a_b[i])/a_b[i]*100, a_l_f[i], (a_l_f[i] - a_f[i])/a_f[i]*100, a_l[i], (a_l[i] - a[i])/a[i]*100)) print('\nSUM\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)' %(sum(a_l_b), ((sum(a_l_b) - sum(a_b))/sum(a_b))*100, sum(a_l_f), ((sum(a_l_f) - sum(a_f))/sum(a_f))*100, sum(a_l), ((sum(a_l) - sum(a))/sum(a))*100)) print('AVG\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)' %(sum(a_l_b)/args.time, ((sum(a_l_b)/args.time - sum(a_b)/args.time)/(sum(a_b)/args.time))*100, sum(a_l_f)/args.time, ((sum(a_l_f)/args.time - sum(a_f)/args.time)/(sum(a_f)/args.time))*100, sum(a_l)/args.time, ((sum(a_l)/args.time - sum(a)/args.time)/(sum(a)/args.time))*100)) plot_loss('Linear', e, a_l_b, a_l_f, a_l) if args.Returns: save_output(output_path, args.time, e, a_l_b, a_l_f, a_l) # start_time = time.process_time() # a_l_b_m, a_l_f_m = [None]*args.time, [None]*args.time # for i in range(1, args.time + 1): # a_l_b_m[i - 1] = bpl_l_m(p_b, e, i, args.window, l) # a_l_f_m[i - 1] = fpl_l_m(p_f, e, i, args.time, args.window, l) # a_l_m = tpl(a_l_b_m, a_l_f_m, e) # duration2 = time.process_time() - start_time # diff = ((duration2 - duration1)/duration1)*100 # success = safe_div((TOTAL - MISS)*100, TOTAL) # print('##############################\n' # 'linear: %.4fs\n' # 'w/ mem: %.4fs\n' # '- succ: %d/%d (%.2f%%)\n' # 'diff : %.4fs (%.2f%%)\n' # 'OK : %r' # %(duration1, # duration2, # TOTAL - MISS, TOTAL, success, # duration2 - duration1, diff, # np.array_equal(a_l, a_l_m))) # MISS, TOTAL = 0, 0 elif args.model == 'e': # Exponential h = 1 # start_time = time.process_time() a_e_b, a_e_f = [None]*args.time, [None]*args.time for i in range(1, args.time + 1): a_e_b[i - 1] = bpl_e_m(p_b, e, i, args.window, h) a_e_f[i - 1] = fpl_e_m(p_f, e, i, args.time, args.window, h) a_e = tpl(a_e_b, a_e_f, e) # duration1 = time.process_time() - start_time if DEBUG: print('\nExponential\n' 't\tbpl\t\t\tfpl\t\t\ttpl') for i in range(0, args.time): print('%d\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)' %(i + 1, a_e_b[i], (a_e_b[i] - a_b[i])/a_b[i]*100, a_e_f[i], (a_e_f[i] - a_f[i])/a_f[i]*100, a_e[i], (a_e[i] - a[i])/a[i]*100)) print('\nSUM\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)' %(sum(a_e_b), ((sum(a_e_b) - sum(a_b))/sum(a_b))*100, sum(a_e_f), ((sum(a_e_f) - sum(a_f))/sum(a_f))*100, sum(a_e), ((sum(a_e) - sum(a))/sum(a))*100)) print('AVG\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)' %(sum(a_e_b)/args.time, ((sum(a_e_b)/args.time - sum(a_b)/args.time)/(sum(a_b)/args.time))*100, sum(a_e_f)/args.time, ((sum(a_e_f)/args.time - sum(a_f)/args.time)/(sum(a_f)/args.time))*100, sum(a_e)/args.time, ((sum(a_e)/args.time - sum(a)/args.time)/(sum(a)/args.time))*100)) plot_loss('Exponential', e, a_e_b, a_e_f, a_e) if args.Returns: save_output(output_path, args.time, e, a_e_b, a_e_f, a_e) # start_time = time.process_time() # a_e_b_m, a_e_f_m = [None]*args.time, [None]*args.time # for i in range(1, args.time + 1): # a_e_b_m[i - 1] = bpl_e_m(p_b, e, i, args.window, h) # a_e_f_m[i - 1] = fpl_e_m(p_f, e, i, args.time, args.window, h) # a_e_m = tpl(a_e_b_m, a_e_f_m, e) # duration2 = time.process_time() - start_time # diff = ((duration2 - duration1)/duration1)*100 # success = safe_div((TOTAL - MISS)*100, TOTAL) # print('##############################\n' # 'exp : %.4fs\n' # 'w/ mem: %.4fs\n' # '- succ: %d/%d (%.2f%%)\n' # 'diff : %.4fs (%.2f%%)\n' # 'OK : %r' # %(duration1, # duration2, # TOTAL - MISS, TOTAL, success, # duration2 - duration1, diff, # np.array_equal(a_e, a_e_m))) # MISS, TOTAL = 0, 0 # if DEBUG: # cmp_loss('Backward', a_b, a_s_b, a_l_b, a_e_b) # cmp_loss('Forward', a_f, a_s_f, a_l_f, a_e_f) # cmp_loss('Total', a, a_s, a_l, a_e) if __name__ == '__main__': try: args = parse_args() print('##############################\n' 'Correlation: %f\n' 'Debug : %r\n' 'Epsilon : %f\n' 'Matrix : %d\n' 'Model : %s\n' 'Output : %s\n' 'Time : %d\n' 'Window : %d\n' '##############################' %(args.correlation, args.debug, args.epsilon, args.matrix, args.model, args.output, args.time, args.window), flush=True) start_time = time.process_time() main(args) end_time = time.process_time() print('##############################\n' 'Time : %.4fs\n' '##############################\n' %(end_time - start_time), flush=True) except KeyboardInterrupt: print('Interrupted by user.') exit()