#!/usr/bin/env python3

import argparse
import collections
import csv
import datetime
import math
from matplotlib import pyplot as plt
import numpy as np
import os
import random
from scipy.sparse.linalg import eigs
from scipy.stats import laplace
from scipy.stats import norm
import time
import xxhash

# Debugging: when enabled, print diagnostic information.
DEBUG = False

# Memoization
MEM = {}        # The cache.
MISS = 0        # Number of additions to the cache.
TOTAL = 0       # Number of cache accesses.


'''
Read data from a file.

Parameters:
    path - The relative path to the data file.
Returns:
    data - A list of tuples [uid, timestamp, lng, lat, loc].
'''
def load_data(path):
    print('Loading data from', os.path.abspath(path), '... ', end='')
    data = []
    try:
        with open(path, 'r') as f:
            reader = csv.reader(f, delimiter=',')
            data = list(reader)
        print('OK.')
        return data
    except IOError:
        print('Oops!')
        exit()


'''
Save output to a file.

Parameters:
    path - The relative path to the output file.
    t    - The number of timestamps.
    e    - The privacy budget at each timestamp.
    a_b  - The backward privacy loss at each timestamp.
    a_f  - The forward privacy loss at each timestamp.
    a    - The temporal privacy loss at each timestamp.
Returns:
    Nothing.
'''
def save_output(path, t, e, a_b, a_f, a):
    # timestamp = time.strftime('%Y%m%d%H%M%S')
    print('Saving output to %s... ' %(path), end='', flush=True)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w') as f:
        w = csv.writer(f, delimiter=';')
        w.writerow(['t', 'e', 'bpl', 'fpl', 'tpl'])
        w.writerows(zip(list(range(1, t + 1)), e, a_b, a_f, a))
    print('OK.', flush=True)


'''
Get all the timestamps from the input data.

Parameters:
    data - The input data set.
Returns:
    timestamps - An ndarray of all of the timestamps from the input data.
'''
def get_timestamps(data):
    print('Getting a list of all timestamps... ', end='', flush=True)
    timestamps = np.sort(np.unique(np.array(data)[:, 1]))
    if not len(timestamps):
        print('No timestamps found.', flush=True)
        exit()
    # Check the timestamps' order.
    prev = None
    for t in timestamps:
        curr = time.mktime(datetime.datetime.strptime(t, '%Y-%m-%d %H:%M').timetuple())
        if prev and curr < prev:
            print('Wrong order.', flush=True)
            exit()
        prev = curr
    print('OK.', flush=True)
    if DEBUG:
        print(timestamps, len(timestamps))
    return timestamps


'''
Get all the unique locations from the input data.

Parameters:
    data - The input data set.
Returns:
    locs - A sorted list of all the unique locations in the input data.
'''
def get_locs(data):
    print('Getting a list of all locations... ', end='', flush=True)
    # np.int was deprecated and removed from NumPy; use the built-in int.
    locs = np.sort(np.unique(np.array(data)[:, 4].astype(int)))
    if not len(locs):
        print('No locations found.', flush=True)
        exit()
    print('OK.', flush=True)
    if DEBUG:
        print(list(map(str, locs)), len(locs))
    return list(map(str, locs))


'''
Get the counts at every location for a specific timestamp.

Parameters:
    data - The input data set.
    t    - The timestamp of interest.
Returns:
    cnts - A dict {loc: cnt} with the counts at every location for a specific timestamp.
'''
def get_cnts(data, t):
    print('Getting all counts at %s... ' %(t), end='', flush=True)
    locs = get_locs(data)
    cnts = dict.fromkeys(locs, 0)
    for d in data:
        if d[1] == t:
            # The keys are strings; indexing with int(d[4]) would raise a KeyError.
            cnts[d[4]] += 1
    print('OK.', flush=True)
    if DEBUG:
        print(cnts)
    return cnts
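
# Illustrative sketch (added for documentation; never called): how the loaders
# above compose. The path 'data/sample.csv' is hypothetical; its rows are
# assumed to follow the [uid, timestamp, lng, lat, loc] layout documented in
# load_data.
def _demo_load_and_count():
    data = load_data('data/sample.csv')     # e.g. ['1', '2024-01-01 09:00', '23.7', '37.9', '42']
    timestamps = get_timestamps(data)       # sorted, order-validated timestamps
    return get_cnts(data, timestamps[0])    # {loc: count} at the first timestamp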
'''
Get the counts at every location for every timestamp.

Parameters:
    data - The input data set.
Returns:
    cnts - A dict {'timestamp@loc': cnt} with all the counts at every location
           for every timestamp.
'''
def get_all_cnts(data):
    cnts = {}
    for d in data:
        key = d[1] + '@' + d[4]
        cnts[key] = cnts.get(key, 0) + 1
    if DEBUG:
        print(cnts)
    return cnts


'''
Get a list of unique users in the input data set.

Parameters:
    data - The input data set.
Returns:
    users - An ndarray of all unique users.
'''
def get_usrs(data):
    users = np.sort(np.unique(np.array(data)[:, 0].astype(int)))
    if not len(users):
        print('No users found.')
        exit()
    if DEBUG:
        print(users, len(users))
    return users


'''
Get the data of a particular user from a data set.

Parameters:
    data - The input data set.
    id   - The user identifier.
Returns:
    output - A list of the data of the targeted user.
'''
def get_usr_data(data, id):
    output = []
    for d in data:
        if d[0] == str(id):
            output.append(d)
    if DEBUG:
        print(id, output)
    return output


'''
Get the data of every user in a data set.

Parameters:
    data - The input data set.
Returns:
    output - A dict {usr: [usr_data]} with the data of each user.
'''
def get_usrs_data(data):
    output = {}
    for d in data:
        output[d[0]] = output.get(d[0], []) + [d]
    return output


'''
Get the trajectory of a user from her data.

Parameters:
    data - The data of the user.
Returns:
    traj - A list [(timestamp, loc)] with the locations and corresponding
           timestamps that the user was at.
'''
def get_usr_traj(data):
    traj = []
    for d in data:
        traj.append((d[1], d[4]))
    if DEBUG:
        print(traj)
    return traj


'''
Get all the possible transitions.

Parameters:
    data - The input data set, as a dict {usr: [usr_data]}.
Returns:
    trans - A set with all the possible forward transitions in the input.
'''
def get_poss_trans(data):
    print('Getting possible transitions... ', end='', flush=True)
    trans = set()
    for u, u_data in data.items():
        traj = get_usr_traj(u_data)
        trans.update(list(zip(traj, traj[1:])))
    if len(trans) == 0:
        print('None.', flush=True)
        exit()
    print('OK.', flush=True)
    return trans


'''
Get all backward transitions in a data set.

Parameters:
    data - The input data set, as a dict {usr: [usr_data]}.
Returns:
    trans - A dict {(t, t-1): [transitions]} with all the backward transitions
            at every sequential timestamp pair in the input data set.
'''
def get_bwd_trans(data):
    print('Getting all backward transitions... ', end='', flush=True)
    trans = {}
    for u, u_data in data.items():
        traj = get_usr_traj(u_data)
        for i, j in zip(traj[1:], traj):
            trans[(i[0], j[0])] = trans.get((i[0], j[0]), []) + [(i[1], j[1])]
    if len(trans) == 0:
        print('None.', flush=True)
        exit()
    print('OK.', flush=True)
    return trans


'''
Get all forward transitions in a data set.

Parameters:
    data - The input data set, as a dict {usr: [usr_data]}.
Returns:
    trans - A dict {(t-1, t): [transitions]} with all the forward transitions
            at every sequential timestamp pair in the input data set.
'''
def get_fwd_trans(data):
    print('Getting all forward transitions... ', end='', flush=True)
    trans = {}
    for u, u_data in data.items():
        traj = get_usr_traj(u_data)
        for i, j in zip(traj, traj[1:]):
            trans[(i[0], j[0])] = trans.get((i[0], j[0]), []) + [(i[1], j[1])]
    if len(trans) == 0:
        print('None.', flush=True)
        exit()
    print('OK.', flush=True)
    return trans


'''
Divide two numbers. If the divisor is 0, return inf.

Parameters:
    a - The dividend.
    b - The divisor.
Returns:
    The float result of the division.
'''
def safe_div(a, b):
    if b == 0:
        return math.inf
    return float(a/b)
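
# Illustrative sketch (added for documentation; never called): extracting
# per-user trajectories and both transition directions from a loaded data set.
# The argument is assumed to be the raw record list returned by load_data.
def _demo_transitions(data):
    usrs_data = get_usrs_data(data)     # {uid: [records]}
    fwd = get_fwd_trans(usrs_data)      # {(t-1, t): [(loc_prev, loc_next), ...]}
    bwd = get_bwd_trans(usrs_data)      # {(t, t-1): [(loc_next, loc_prev), ...]}
    return fwd, bwd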
'''
Calculate the maximum value of the objective function.

Parameters:
    q - A row from the transition matrix.
    d - Another row from the transition matrix.
    a - The backward/forward privacy loss of the previous/next timestamp.
Returns:
    The maximum value of the objective function.
'''
def max_val(q, d, a):
    if a == math.inf:
        return math.nan
    return (q*(math.exp(a) - 1) + 1)/(d*(math.exp(a) - 1) + 1)


'''
Find two different rows (q and d) of a transition matrix (p) that maximize
the objective function and return their sums.

Parameters:
    p - The transition matrix representing the backward/forward correlations.
    a - The backward/forward privacy loss of the previous/next timestamp.
Returns:
    sum_q - The sum of the elements of q.
    sum_d - The sum of the elements of d.
'''
def find_qd(p, a):
    res = 0.0
    sum_q, sum_d = 0.0, 0.0
    for q in p:         # A row from the transition matrix.
        for d in p:     # Another row from the transition matrix.
            if not np.array_equal(q, d) and sum(q) and sum(d):
                plus = []   # Indexes of the elements of q+ and d+.
                for i in range(np.shape(p)[1]):     # Iterate each column.
                    if q[i] > d[i]:
                        plus.append(i)
                s_q, s_d = 0.0, 0.0
                update = True
                while update:
                    update = False
                    # Recompute the sums from scratch at every pass;
                    # accumulating across passes would double count.
                    s_q = sum(q[i] for i in plus)
                    s_d = sum(d[i] for i in plus)
                    max_tmp = max_val(s_q, s_d, a)
                    # Iterate over a copy: removing from the list being
                    # iterated would skip elements.
                    for i in list(plus):
                        if safe_div(q[i], d[i]) <= max_tmp:
                            plus.remove(i)
                            update = True
                res_tmp = math.log(max_val(s_q, s_d, a))
                if res < res_tmp:
                    res = res_tmp
                    sum_q, sum_d = s_q, s_d
    return sum_q, sum_d


'''
Generate data.

Parameters:
    usrs       - The number of users.
    timestamps - The number of timestamps.
    locs       - The number of locations.
Returns:
    data - The generated data.
'''
def gen_data(usrs, timestamps, locs):
    print('Generating data... ', end='', flush=True)
    # Generate timestamps.
    ts = []
    t_start = datetime.datetime.now()
    for t in range(timestamps):
        t_start += datetime.timedelta(minutes=t)
        ts.append(str(t_start.strftime('%Y-%m-%d %H:%M')))
    # Randomly generate unique possible transitions.
    trans = set()
    for i in range(locs**2):
        trans.add((str(random.randint(1, locs)), str(random.randint(1, locs))))
    # Generate the data.
    data = []
    for u in range(1, usrs + 1):
        # Choose a random starting location.
        # (random.sample no longer accepts a set, hence the conversion.)
        l_start = random.choice(tuple(trans))[0]
        for t in ts:
            # Choose a random next location reachable from the start.
            while True:
                l_nxt = str(random.randint(1, locs))
                if (l_start, l_nxt) in trans:
                    data.append([str(u), str(t), str(l_nxt), str(l_nxt), str(l_nxt)])
                    break
    print('OK.', flush=True)
    return data


'''
Generate a transition matrix.

Parameters:
    n - The dimension of the matrix.
    s - The correlation degree of each row [0, 1]. The lower its value,
        the lower the degree of uniformity of each row.
Returns:
    p_ - The transition matrix.
'''
def gen_trans_mt(n, s):
    if DEBUG:
        print('Generating transition matrix %dx%d with s = %.4f... ' %(n, n, s), end='', flush=True)
    p = np.zeros((n, n), float)
    np.fill_diagonal(p, 1.0)
    p_ = np.zeros((n, n), float)
    for j, p_j in enumerate(p[:, ]):
        for k, p_jk in enumerate(p_j):
            p_[j, k] = round((p[j, k] + s) / (sum(p_j) + len(p_j)*s), 4)
    if DEBUG:
        print('OK.', flush=True)
        print(p_)
    return p_
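
# Minimal sanity check for find_qd (added for documentation; never called):
# on a 2x2 matrix the optimal (q, d) pair is one of the two row pairs, so the
# sums are easy to inspect by hand. The matrix values and a = 1.0 are
# illustrative only.
def _demo_find_qd():
    p = np.array([[0.9, 0.1],
                  [0.2, 0.8]])
    sum_q, sum_d = find_qd(p, 1.0)      # a = 1.0: loss of the adjacent timestamp
    # One step of backward/forward loss, before the budget e is added.
    return math.log(max_val(sum_q, sum_d, 1.0))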
'''
Get the transition matrix.

Parameters:
    locs  - A list of all the locations.
    trans - A list of all transitions.
Returns:
    p - A 2d dict {{locs}{locs}} containing the corresponding location
        transition probabilities.
'''
def get_trans_mt(locs, trans):
    if DEBUG:
        print('Generating the transition matrix... ', end='', flush=True)
    # Initialize the transition matrix.
    p = collections.defaultdict(dict)
    for i in locs:
        for j in locs:
            p[i][j] = 0.0
    # Count all transitions once, instead of rebuilding the Counter for every pair.
    cnts = collections.Counter(trans)
    # Populate the transition matrix.
    for i in locs:
        # Count the transitions from i to any location.
        sum_cnts = len([tr for tr in trans if tr[0] == i])
        for j in locs:
            # Count the transitions from i to j.
            cnt = cnts[(i, j)]
            res = 0.0
            if cnt != 0 and sum_cnts != 0:
                res = cnt/sum_cnts
            # Update the transition matrix.
            p[i][j] = res
    if DEBUG:
        print('OK.', flush=True)
        for i in locs:
            print(p[i])
    return p


'''
Calculate the measure-theoretic (Kolmogorov-Sinai) entropy of a transition matrix.

Parameters:
    mt - A 2d dict transition matrix.
Returns:
    h - The Kolmogorov-Sinai entropy of the matrix.
'''
def get_entropy(mt):
    if DEBUG:
        print('Calculating the measure-theoretic entropy... ', end='', flush=True)
    h = 0.0
    # Get the 2d array of the transition matrix.
    a = get_2darray(mt)
    # Check if the transition matrix is empty.
    if np.allclose(a, np.zeros((len(a), len(a)), float)):
        return h
    # Find the stationary distribution (v), written as a row vector, that
    # does not change under the application of the transition matrix; in
    # other words, the left eigenvector associated with eigenvalue (w) 1.
    # It is also called the Perron-Frobenius eigenvector: since the matrix
    # is stochastic, the largest magnitude of its eigenvalues must be 1.
    # We use the transpose of the transition matrix to get the left
    # eigenvector.
    w, v = eigs(a.T, k=1, which='LM')
    # Normalize the eigenvector. The vector is a probability distribution,
    # and therefore its sum must be 1. Thus, we use its absolute elements'
    # values and its l1-norm to normalize it.
    p = np.absolute(v[:, 0].real)
    p_norm = np.linalg.norm(p, ord=1)
    if p_norm:
        p = p/p_norm
    # Calculate the entropy.
    if np.isclose(w.real[0], 1, atol=1e-03):
        for i in range(len(a)):
            for j in range(len(a)):
                # When the transition probability is 0, use the limiting value (0).
                if not np.isclose(a[i][j], 0, atol=1e-03) and not np.isclose(p[i], 0, atol=1e-03):
                    # The binary logarithm measures entropy in bits.
                    h += -p[i]*a[i][j]*math.log2(a[i][j])
    if DEBUG:
        print(h, flush=True)
    return h


'''
Convert a 2d dict to a 2d array.

Parameters:
    mt - The 2d dict.
Returns:
    p - The 2d numpy array.
'''
def get_2darray(mt):
    if isinstance(mt, np.ndarray):
        return mt
    p = np.zeros((len(mt), len(mt)), float)
    for i, mt_i in enumerate(mt.items()):
        p[i] = list(mt_i[1].values())
    return p


'''
Get a Laplace probability distribution over the timestamps.

Parameters:
    ts - The points (timestamps) of the distribution.
    t  - The location of the distribution.
    sc - The scale of the distribution.
Returns:
    The probability distribution.
'''
def get_laplace_pd(ts, t, sc):
    x = np.arange(0, len(ts), 1)
    loc = np.where(ts == t)[0][0]   # The index of t in ts.
    return laplace.pdf(x, loc=loc, scale=sc)


'''
Get a Gaussian probability distribution over the timestamps.

Parameters:
    ts - The points (timestamps) of the distribution.
    t  - The location of the distribution.
    sc - The scale of the distribution.
Returns:
    The probability distribution.
'''
def get_norm_pd(ts, t, sc):
    x = np.arange(0, len(ts), 1)
    loc = np.where(ts == t)[0][0]   # The index of t in ts.
    return norm.pdf(x, loc=loc, scale=sc)
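
# Illustrative sketch (added for documentation; never called): building a
# Laplace distribution centred on a timestamp and sampling around it with
# get_sample (defined below). The timestamps and the 50% portion are
# illustrative only.
def _demo_sampling():
    ts = np.array(['2024-01-01 09:00', '2024-01-01 09:01', '2024-01-01 09:02',
                   '2024-01-01 09:03', '2024-01-01 09:04'])
    pd = get_laplace_pd(ts, ts[2], sc=1.0)  # peak at the middle timestamp
    return get_sample(ts, ts[2], .5, pd)    # sample 50% of the remaining timestamps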
'''
Get a sample from the time series.

Parameters:
    ts  - An ndarray of the timestamps.
    t   - The current timestamp.
    pct - The desired portion [0, 1] of the non-zero elements of the
          probability distribution to be sampled.
    pd  - The probability distribution.
Returns:
    spl - An ndarray of the sampled timestamps.
'''
def get_sample(ts, t, pct, pd):
    if DEBUG:
        print('Sampling %.2f%% of %s at %s... ' %(pct*100, ts, t), end='', flush=True)
    # Check that it is a valid timestamp.
    if t not in ts:
        print('%s is not in the time series.' %(t))
        exit()
    # Remove the current timestamp from the possible sample.
    i = np.where(ts == t)
    ts = np.delete(ts, i)
    pd = np.delete(pd, i)
    # Make sure that the probabilities sum to 1.
    pd_norm = pd/np.linalg.norm(pd, ord=1)
    # Get the sorted sample.
    spl = np.sort(np.random.choice(ts, size=int(np.count_nonzero(pd)*pct), replace=False, p=pd_norm))
    if DEBUG:
        print(spl, flush=True)
    return spl


'''
Calculate the backward/forward privacy loss at the current timestamp.

Parameters:
    p - The transition matrix representing the backward/forward temporal correlations.
    a - The privacy loss of the previous/next timestamp.
    e - The privacy budget for data publishing.
Returns:
    The backward/forward privacy loss at the current timestamp.
'''
def priv_l(p, a, e):
    sum_q, sum_d = find_qd(p, a)
    return math.log(max_val(sum_q, sum_d, a)) + e


'''
Calculate the backward/forward privacy loss at the current timestamp using memoization.

Parameters:
    p - The transition matrix representing the backward/forward temporal correlations.
    a - The privacy loss of the previous/next timestamp.
    e - The privacy budget for data publishing.
Returns:
    The backward/forward privacy loss at the current timestamp.
'''
def priv_l_m(p, a, e):
    key = xxhash.xxh64(p).hexdigest() + str(a) + str(e)
    global MEM, TOTAL, MISS
    TOTAL += 1
    result = MEM.get(key)
    if result is None:
        MISS += 1
        sum_q, sum_d = find_qd(p, a)
        result = math.log(max_val(sum_q, sum_d, a)) + e
        MEM[key] = result
    return result


'''
Calculate the total backward privacy loss at every timestamp.

Parameters:
    p - The transition matrix representing the backward temporal correlations.
    a - The backward privacy loss of every release.
    e - The privacy budget for data publishing.
    t - The time limit.
Returns:
    a - The backward privacy loss at every timestamp due to the previous data releases.
'''
def bpl(p, a, e, t):
    a[0] = e[0]
    for i in range(1, t):
        a[i] = priv_l(p, a[i - 1], e[i])
    return a


'''
Calculate the total backward privacy loss at the current timestamp with memoization.

Parameters:
    p - The transition matrix representing the backward temporal correlations.
    a - The backward privacy loss of the current release at all previous timestamps.
    e - The privacy budget for data publishing.
    t - The time limit.
Returns:
    a - The backward privacy loss at the current timestamp due to the previous data releases.
'''
def bpl_m(p, a, e, t):
    a[0] = e[0]
    for i in range(1, t):
        a[i] = priv_l_m(p, a[i - 1], e[i])
    return a


'''
Calculate the total backward privacy loss at the current timestamp, taking
into account the previous landmark, using memoization.

Parameters:
    p    - The transition matrix representing the backward temporal correlations.
    a    - The backward privacy loss of every release.
    e    - The privacy budget for data publishing.
    t    - The current timestamp.
    lmdk - The previous landmark.
Returns:
    a - The backward privacy loss at every timestamp due to the previous data releases.
'''
def bpl_lmdk_mem(p, a, e, t, lmdk):
    # t is (near) the landmark.
    if lmdk == t - 1 or t == lmdk:
        return np.array([0]*len(a))
    # There is no landmark before.
    if lmdk < 1:
        lmdk = 1
    a[0] = e[0]
    for i in range(lmdk, t):
        a[i] = priv_l_m(p, a[i - 1], e[i])
    return a


'''
Calculate the total backward privacy loss at the current timestamp using the
static model, i.e., previous releases are grouped in a window of static size.

Parameters:
    p - The transition matrix representing the backward temporal correlations.
    e - The privacy budget for data publishing.
    i - The timestamp of interest.
    w - The window size to group previous releases.
Returns:
    a - The backward privacy loss at the current timestamp due to the previous data releases.
'''
def bpl_s(p, e, i, w):
    # Check the base case first; the original ordering left this branch
    # unreachable, since i - w <= 1 also holds when i == 1.
    if i == 1:
        # print('bpl_s: %d' %(i))
        return e[0]
    elif i - w > 1:
        # print('bpl_s: %d - %d [%d]' %(i, i - w, w))
        return priv_l(np.linalg.matrix_power(p, w), bpl_s(p, e, i - w, w), e[i - 1])
    else:   # i - w <= 1
        # print('bpl_s: %d - %d [%d]' %(i, 1, i - 1))
        return priv_l(np.linalg.matrix_power(p, i - 1), e[0], e[i - 1])
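
# Worked example (added for documentation; never called): with a uniform
# budget and a 2x2 correlation matrix, bpl simply iterates priv_l forward in
# time, so a[i] = log(max_val(sum_q, sum_d, a[i-1])) + e[i]. All values are
# illustrative only.
def _demo_bpl():
    p = gen_trans_mt(2, 0.1)        # weakly uniform 2x2 matrix
    e = [1.0] * 5                   # uniform budget over 5 timestamps
    return bpl(p, [0.0] * 5, e, 5)  # backward loss at each timestamp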
'''
Calculate the total backward privacy loss at the current timestamp using the
static model, i.e., previous releases are grouped in a window of static size,
using memoization.

Parameters:
    p - The transition matrix representing the backward temporal correlations.
    e - The privacy budget for data publishing.
    i - The timestamp of interest.
    w - The window size to group previous releases.
Returns:
    a - The backward privacy loss at the current timestamp due to the previous data releases.
'''
def bpl_s_m(p, e, i, w):
    if i == 1:      # Base case first; see bpl_s.
        return e[0]
    elif i - w > 1:
        return priv_l_m(np.linalg.matrix_power(p, w), bpl_s_m(p, e, i - w, w), e[i - 1])
    else:           # i - w <= 1
        return priv_l_m(np.linalg.matrix_power(p, i - 1), e[0], e[i - 1])


'''
Calculate the total backward privacy loss at the current timestamp using the
linear model, i.e., previous releases are grouped in a window of a size that
increases linearly.

Parameters:
    p - The transition matrix representing the backward temporal correlations.
    e - The privacy budget for data publishing.
    i - The timestamp of interest.
    w - The window size to group previous releases.
    l - The linearly increasing coefficient that affects the window size.
Returns:
    a - The backward privacy loss at the current timestamp due to the previous data releases.
'''
def bpl_l(p, e, i, w, l):
    if i == 1:      # Base case first; see bpl_s.
        # print('bpl_l: %d' %(1))
        return e[0]
    elif i - w*l > 1:
        # print('bpl_l: %d - %d [%d]' %(i, i - w*l, w*l))
        return priv_l(np.linalg.matrix_power(p, w*l), bpl_l(p, e, i - w*l, w, l + 1), e[i - 1])
    else:           # i - w*l <= 1
        # print('bpl_l: %d - %d [%d]' %(i, 1, i - 1))
        return priv_l(np.linalg.matrix_power(p, i - 1), e[0], e[i - 1])


'''
Calculate the total backward privacy loss at the current timestamp using the
linear model, i.e., previous releases are grouped in a window of a size that
increases linearly, using memoization.

Parameters:
    p - The transition matrix representing the backward temporal correlations.
    e - The privacy budget for data publishing.
    i - The timestamp of interest.
    w - The window size to group previous releases.
    l - The linearly increasing coefficient that affects the window size.
Returns:
    a - The backward privacy loss at the current timestamp due to the previous data releases.
'''
def bpl_l_m(p, e, i, w, l):
    if i == 1:      # Base case first; see bpl_s.
        return e[0]
    elif i - w*l > 1:
        return priv_l_m(np.linalg.matrix_power(p, w*l), bpl_l_m(p, e, i - w*l, w, l + 1), e[i - 1])
    else:           # i - w*l <= 1
        return priv_l_m(np.linalg.matrix_power(p, i - 1), e[0], e[i - 1])


'''
Calculate the total backward privacy loss at the current timestamp using the
exponential model, i.e., previous releases are grouped in a window of a size
that increases exponentially.

Parameters:
    p - The transition matrix representing the backward temporal correlations.
    e - The privacy budget for data publishing.
    i - The timestamp of interest.
    w - The window size to group previous releases.
    h - The exponentially increasing coefficient that affects the window size.
Returns:
    a - The backward privacy loss at the current timestamp due to the previous data releases.
'''
def bpl_e(p, e, i, w, h):
    if i == 1:      # Base case first; see bpl_s.
        # print('bpl_e: %d' %(1))
        return e[0]
    elif i - w**h > 1:
        # print('bpl_e: %d - %d [%d]' %(i, i - w**h, w**h))
        return priv_l(np.linalg.matrix_power(p, w**h), bpl_e(p, e, i - w**h, w, h + 1), e[i - 1])
    else:           # i - w**h <= 1
        # print('bpl_e: %d - %d [%d]' %(i, 1, i - 1))
        return priv_l(np.linalg.matrix_power(p, i - 1), e[0], e[i - 1])
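
# Illustrative sketch (added for documentation; never called): the three
# grouping schedules side by side. With w = 2, the static model steps 2
# timestamps at a time into the past, the linear model steps 2, 4, 6, ...
# and the exponential model steps 2, 4, 8, ... All values are illustrative.
def _demo_windowed_bpl():
    p = gen_trans_mt(2, 0.1)
    e = [1.0] * 8
    return (bpl_s(p, e, 8, 2),      # static window
            bpl_l(p, e, 8, 2, 1),   # linearly growing window
            bpl_e(p, e, 8, 2, 1))   # exponentially growing window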
'''
Calculate the total backward privacy loss at the current timestamp using the
exponential model, i.e., previous releases are grouped in a window of a size
that increases exponentially, using memoization.

Parameters:
    p - The transition matrix representing the backward temporal correlations.
    e - The privacy budget for data publishing.
    i - The timestamp of interest.
    w - The window size to group previous releases.
    h - The exponentially increasing coefficient that affects the window size.
Returns:
    a - The backward privacy loss at the current timestamp due to the previous data releases.
'''
def bpl_e_m(p, e, i, w, h):
    if i == 1:      # Base case first; see bpl_s.
        return e[0]
    elif i - w**h > 1:
        return priv_l_m(np.linalg.matrix_power(p, w**h), bpl_e_m(p, e, i - w**h, w, h + 1), e[i - 1])
    else:           # i - w**h <= 1
        return priv_l_m(np.linalg.matrix_power(p, i - 1), e[0], e[i - 1])


'''
Calculate the total forward privacy loss at the current timestamp.

Parameters:
    p - The transition matrix representing the forward temporal correlations.
    a - The forward privacy loss of the current release at all next timestamps.
    e - The privacy budget for data publishing.
    t - The time limit.
Returns:
    a - The forward privacy loss at the current timestamp due to the next data releases.
'''
def fpl(p, a, e, t):
    a[t - 1] = e[t - 1]
    for i in range(t - 2, -1, -1):
        a[i] = priv_l(p, a[i + 1], e[i])
    return a


'''
Calculate the total forward privacy loss at the current timestamp, using memoization.

Parameters:
    p - The transition matrix representing the forward temporal correlations.
    a - The forward privacy loss of the current release at all next timestamps.
    e - The privacy budget for data publishing.
    t - The time limit.
Returns:
    a - The forward privacy loss at the current timestamp due to the next data releases.
'''
def fpl_m(p, a, e, t):
    a[t - 1] = e[t - 1]
    for i in range(t - 2, -1, -1):
        a[i] = priv_l_m(p, a[i + 1], e[i])
    return a


'''
Calculate the total forward privacy loss at the current timestamp, taking
into account the next landmark, using memoization.

Parameters:
    p    - The transition matrix representing the forward temporal correlations.
    a    - The forward privacy loss of every release.
    e    - The privacy budget for data publishing.
    t    - The current timestamp.
    lmdk - The next landmark.
Returns:
    a - The forward privacy loss at every timestamp due to the next data releases.
'''
def fpl_lmdk_mem(p, a, e, t, lmdk):
    # t is (near) the landmark.
    if lmdk == t + 1 or t == lmdk:
        return np.array([0]*len(a))
    # There is no landmark after.
    if lmdk > len(e):
        lmdk = len(e)
    a[lmdk - 1] = e[lmdk - 1]
    for i in range(lmdk - 2, t - 2, -1):
        a[i] = priv_l_m(p, a[i + 1], e[i])
    return a


'''
Calculate the total forward privacy loss at the current timestamp using the
static model, i.e., next releases are grouped in a window of static size.

Parameters:
    p - The transition matrix representing the forward temporal correlations.
    e - The privacy budget for data publishing.
    i - The timestamp of interest.
    t - The time limit.
    w - The window size to group next releases.
Returns:
    a - The forward privacy loss at the current timestamp due to the next data releases.
'''
def fpl_s(p, e, i, t, w):
    # Check the base case first; the original ordering left this branch
    # unreachable, since i + w >= t also holds when i == t.
    if i == t:
        # print('fpl_s: %d' %(t))
        return e[t - 1]
    elif i + w < t:
        # print('fpl_s: %d - %d [%d]' %(i, i + w, w))
        return priv_l(np.linalg.matrix_power(p, w), fpl_s(p, e, i + w, t, w), e[i - 1])
    else:   # i + w >= t
        # print('fpl_s: %d - %d [%d]' %(i, t, t - i))
        return priv_l(np.linalg.matrix_power(p, t - i), e[t - 1], e[i - 1])


'''
Calculate the total forward privacy loss at the current timestamp using the
static model, i.e., next releases are grouped in a window of static size,
using memoization.

Parameters:
    p - The transition matrix representing the forward temporal correlations.
    e - The privacy budget for data publishing.
    i - The timestamp of interest.
    t - The time limit.
    w - The window size to group next releases.
Returns:
    a - The forward privacy loss at the current timestamp due to the next data releases.
'''
def fpl_s_m(p, e, i, t, w):
    if i == t:      # Base case first; see fpl_s.
        return e[t - 1]
    elif i + w < t:
        return priv_l_m(np.linalg.matrix_power(p, w), fpl_s_m(p, e, i + w, t, w), e[i - 1])
    else:           # i + w >= t
        return priv_l_m(np.linalg.matrix_power(p, t - i), e[t - 1], e[i - 1])
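
# Illustrative sketch (added for documentation; never called): fpl mirrors
# bpl but iterates backwards from the last timestamp, so the loss accumulates
# toward t = 1. All values are illustrative only.
def _demo_fpl():
    p = gen_trans_mt(2, 0.1)
    e = [1.0] * 5
    return fpl(p, [0.0] * 5, e, 5)  # forward loss at each timestamp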
'''
Calculate the total forward privacy loss at the current timestamp using the
linear model, i.e., next releases are grouped in a window of a size that
increases linearly.

Parameters:
    p - The transition matrix representing the forward temporal correlations.
    e - The privacy budget for data publishing.
    i - The timestamp of interest.
    t - The time limit.
    w - The window size to group next releases.
    l - The linearly increasing coefficient that affects the window size.
Returns:
    a - The forward privacy loss at the current timestamp due to the next data releases.
'''
def fpl_l(p, e, i, t, w, l):
    if i == t:      # Base case first; see fpl_s.
        # print('fpl_l: %d' %(t))
        return e[t - 1]
    elif i + w*l < t:
        # print('fpl_l: %d - %d [%d]' %(i, i + w*l, w*l))
        return priv_l(np.linalg.matrix_power(p, w*l), fpl_l(p, e, i + w*l, t, w, l + 1), e[i - 1])
    else:           # i + w*l >= t
        # print('fpl_l: %d - %d [%d]' %(i, t, t - i))
        return priv_l(np.linalg.matrix_power(p, t - i), e[t - 1], e[i - 1])


'''
Calculate the total forward privacy loss at the current timestamp using the
linear model, i.e., next releases are grouped in a window of a size that
increases linearly, using memoization.

Parameters:
    p - The transition matrix representing the forward temporal correlations.
    e - The privacy budget for data publishing.
    i - The timestamp of interest.
    t - The time limit.
    w - The window size to group next releases.
    l - The linearly increasing coefficient that affects the window size.
Returns:
    a - The forward privacy loss at the current timestamp due to the next data releases.
'''
def fpl_l_m(p, e, i, t, w, l):
    if i == t:      # Base case first; see fpl_s.
        return e[t - 1]
    elif i + w*l < t:
        return priv_l_m(np.linalg.matrix_power(p, w*l), fpl_l_m(p, e, i + w*l, t, w, l + 1), e[i - 1])
    else:           # i + w*l >= t
        return priv_l_m(np.linalg.matrix_power(p, t - i), e[t - 1], e[i - 1])


'''
Calculate the total forward privacy loss at the current timestamp using the
exponential model, i.e., next releases are grouped in a window of a size that
increases exponentially.

Parameters:
    p - The transition matrix representing the forward temporal correlations.
    e - The privacy budget for data publishing.
    i - The timestamp of interest.
    t - The time limit.
    w - The window size to group next releases.
    h - The exponentially increasing coefficient that affects the window size.
Returns:
    a - The forward privacy loss at the current timestamp due to the next data releases.
'''
def fpl_e(p, e, i, t, w, h):
    if i == t:      # Base case first; see fpl_s.
        # print('fpl_e: %d' %(t))
        return e[t - 1]
    elif i + w**h < t:
        # print('fpl_e: %d - %d [%d]' %(i, i + w**h, w**h))
        return priv_l(np.linalg.matrix_power(p, w**h), fpl_e(p, e, i + w**h, t, w, h + 1), e[i - 1])
    else:           # i + w**h >= t
        # print('fpl_e: %d - %d [%d]' %(i, t, t - i))
        return priv_l(np.linalg.matrix_power(p, t - i), e[t - 1], e[i - 1])


'''
Calculate the total forward privacy loss at the current timestamp using the
exponential model, i.e., next releases are grouped in a window of a size that
increases exponentially, using memoization.

Parameters:
    p - The transition matrix representing the forward temporal correlations.
    e - The privacy budget for data publishing.
    i - The timestamp of interest.
    t - The time limit.
    w - The window size to group next releases.
    h - The exponentially increasing coefficient that affects the window size.
Returns:
    a - The forward privacy loss at the current timestamp due to the next data releases.
'''
def fpl_e_m(p, e, i, t, w, h):
    if i == t:      # Base case first; see fpl_s.
        return e[t - 1]
    elif i + w**h < t:
        return priv_l_m(np.linalg.matrix_power(p, w**h), fpl_e_m(p, e, i + w**h, t, w, h + 1), e[i - 1])
    else:           # i + w**h >= t
        return priv_l_m(np.linalg.matrix_power(p, t - i), e[t - 1], e[i - 1])
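
# Illustrative sketch (added for documentation; never called): combining both
# directions with tpl (defined below). The budget e is counted in both a_b and
# a_f, hence the subtraction in tpl to avoid double counting. All values are
# illustrative only.
def _demo_tpl():
    p_b, p_f = gen_trans_mt(2, 0.1), gen_trans_mt(2, 0.1)
    e = [1.0] * 5
    a_b = bpl(p_b, [0.0] * 5, e, 5)
    a_f = fpl(p_f, [0.0] * 5, e, 5)
    return tpl(a_b, a_f, e)     # a_b[i] + a_f[i] - e[i] at each i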
'''
Calculate the total privacy loss at every timestamp.

Parameters:
    bpl - The backward privacy loss.
    fpl - The forward privacy loss.
    e   - The privacy budget for data publishing.
Returns:
    The list of the total privacy loss at every timestamp.
'''
def tpl(bpl, fpl, e):
    return [x + y - z for (x, y, z) in zip(bpl, fpl, e)]


'''
Calculate the temporal privacy loss at every timestamp taking into account landmarks.

Parameters:
    e     - The privacy budget for data publishing.
    p_b   - The transition matrix representing the backward temporal correlations.
    p_f   - The transition matrix representing the forward temporal correlations.
    seq   - The point sequence.
    lmdks - The landmarks.
Returns:
    a_b - The backward privacy loss at the current timestamp due to the previous data releases.
    a_f - The forward privacy loss at the current timestamp due to the next data releases.
    a   - The total privacy loss at every timestamp taking into account landmarks.
'''
def tpl_lmdk_mem(e, p_b, p_f, seq, lmdks):
    a_b = np.zeros(len(seq))
    a_f = np.zeros(len(seq))
    a = np.zeros(len(seq))
    for t in seq:
        t_prv, t_nxt = get_limits(t, seq, lmdks)
        a_b[t - 1] = bpl_lmdk_mem(p_b, np.zeros(max(seq)), e, t, t_prv)[t - 1]
        a_f[t - 1] = fpl_lmdk_mem(p_f, np.zeros(max(seq)), e, t, t_nxt)[t - 1]
        if a_b[t - 1] > 0 and a_f[t - 1] > 0:
            a[t - 1] = a_b[t - 1] + a_f[t - 1] - e[t - 1]
        elif a_b[t - 1] > 0 or a_f[t - 1] > 0:
            a[t - 1] = a_b[t - 1] + a_f[t - 1]
        else:
            a[t - 1] = e[t - 1]
    return a_b, a_f, a


'''
Get the limits for the calculation of the temporal privacy loss.

Parameters:
    t     - The current timestamp.
    seq   - The point sequence.
    lmdks - The landmarks.
Returns:
    t_prv - The previous landmark.
    t_nxt - The next landmark.
'''
def get_limits(t, seq, lmdks):
    # Add landmark limits.
    seq_lmdks = np.copy(lmdks)
    # if seq[0] not in seq_lmdks:
    #     seq_lmdks = np.append(seq_lmdks, seq[0])
    # if seq[len(seq) - 1] not in seq_lmdks:
    #     seq_lmdks = np.append(seq_lmdks, seq[len(seq) - 1])
    # seq_lmdks = np.sort(seq_lmdks)
    # Initialize the previous and next landmarks.
    t_prv = 0
    t_nxt = len(seq) + 1
    # Add the current timestamp to the landmarks.
    seq_lmdks_tmp = np.copy(seq_lmdks)
    if t not in seq_lmdks_tmp:
        seq_lmdks_tmp = np.append(seq_lmdks_tmp, t)
        seq_lmdks_tmp = np.sort(seq_lmdks_tmp)
    # Find the index of the current timestamp in the landmarks.
    idx, = np.where(seq_lmdks_tmp == t)
    i = idx[0]
    # Find the previous landmark.
    if i != 0:
        t_prv = seq_lmdks_tmp[i - 1]
    # Find the next landmark.
    if i != len(seq_lmdks_tmp) - 1:
        t_nxt = seq_lmdks_tmp[i + 1]
    return t_prv, t_nxt


'''
Plot the privacy loss of the time series.

Parameters:
    title - The title of the plot.
    e     - The privacy budget for data publishing.
    a_b   - The backward privacy loss.
    a_f   - The forward privacy loss.
    a     - The total privacy loss.
Returns:
    Nothing.
'''
def plot_loss(title, e, a_b, a_f, a):
    plt.rc('font', family='serif')
    plt.rc('font', size=10)
    plt.rc('text', usetex=True)
    p = np.arange(1.0, len(e) + 1, 1)
    plt.plot(p, e,
             label=r'$\varepsilon$',
             color='gray',
             marker='s')
    plt.plot(p, a_b,
             label=r'$\alpha^B$',
             color='#e52e4e',   # red
             marker='<')
    plt.plot(p, a_f,
             label=r'$\alpha^F$',
             color='#009874',   # green
             marker='>')
    plt.plot(p, a,
             label=r'$\alpha$',
             color='#6082b6',   # blue
             marker='d')
    plt.axis([1.0, len(e), 0.0, 1.5*max(a)])
    plt.legend(loc='best', frameon=True)
    plt.grid(axis='y', alpha=1.0)   # Add a grid on the y axis.
    plt.xlabel('time')              # Set the x axis label.
    plt.ylabel('privacy loss')      # Set the y axis label.
    plt.title(title)
    plt.show()
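
# Illustrative sketch (added for documentation; never called): get_limits
# around a landmark sequence. With landmarks at 3 and 7 in a sequence of 10
# timestamps, timestamp 5 is bounded by t_prv = 3 and t_nxt = 7. The values
# are illustrative only.
def _demo_get_limits():
    seq = np.arange(1, 11)
    lmdks = np.array([3, 7])
    return get_limits(5, seq, lmdks)    # (3, 7)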
'''
Plot a comparison of the privacy loss of all models.

Parameters:
    title - The title of the plot.
    a     - The privacy loss of the basic model.
    a_s   - The privacy loss of the static model.
    a_e   - The privacy loss of the exponential model.
    a_l   - The privacy loss of the linear model.
Returns:
    Nothing.
'''
def cmp_loss(title, a, a_s, a_e, a_l):
    plt.rc('font', family='serif')
    plt.rc('font', size=10)
    plt.rc('text', usetex=True)
    p = np.arange(1.0, len(a) + 1, 1)
    plt.plot(p, a,
             label=r'$\alpha$',
             color='red',
             marker='s')
    plt.plot(p, a_s,
             label=r'$\alpha_s$',
             color='green',
             marker='>')
    plt.plot(p, a_e,
             label=r'$\alpha_e$',
             color='blue',
             marker='d')
    plt.plot(p, a_l,
             label=r'$\alpha_l$',
             color='yellow',
             marker='<')
    plt.axis([1.0, len(a), 0.0, 1.5*max(a)])
    plt.legend(loc='best', frameon=True)
    plt.grid(axis='y', alpha=1.0)   # Add a grid on the y axis.
    plt.xlabel('time')              # Set the x axis label.
    plt.ylabel('privacy loss')      # Set the y axis label.
    plt.title(title)
    plt.show()


'''
Parse arguments.

Mandatory:
    model               The model to be used: (b)asic, (s)tatic, (l)inear, (e)xponential.
Optional:
    -c, --correlation   The correlation degree.
    -d, --debug         Enable debugging mode.
    -e, --epsilon       The available privacy budget.
    -m, --matrix        The size of the transition matrix.
    -o, --output        The path to the output directory.
    -t, --time          The time limit.
    -w, --window        The size of the event protection window.
'''
def parse_args():
    # Create the argument parser.
    parser = argparse.ArgumentParser()
    # Mandatory arguments.
    parser.add_argument('model', help='The model to be used.', choices=['b', 's', 'l', 'e'], type=str)
    # Optional arguments.
    parser.add_argument('-c', '--correlation', help='The correlation degree.', type=float, default=0.1)
    parser.add_argument('-d', '--debug', help='Enable debugging mode.', action='store_true')
    parser.add_argument('-e', '--epsilon', help='The available privacy budget.', type=float, default=1.0)
    parser.add_argument('-m', '--matrix', help='The matrix size.', type=int, default=2)
    parser.add_argument('-o', '--output', help='The path to the output directory.', type=str, default='')
    parser.add_argument('-t', '--time', help='The time limit.', type=int, default=1)
    parser.add_argument('-w', '--window', help='The size of the event protection window.', type=int, default=1)
    # Parse the arguments.
    args = parser.parse_args()
    return args


def main(args):
    global DEBUG, MISS, TOTAL
    DEBUG = args.debug
    MISS, TOTAL = 0, 0
    e = [args.epsilon]*args.time
    # Generate the transition matrices.
    p_b = gen_trans_mt(args.matrix, args.correlation)
    p_f = gen_trans_mt(args.matrix, args.correlation)
    if DEBUG:
        # Run the basic model for comparison.
        a_b = bpl(p_b, [0.0] * args.time, e, args.time)
        a_f = fpl(p_f, [0.0] * args.time, e, args.time)
        a = tpl(a_b, a_f, e)
    output_path = args.output +\
        '/' + args.model +\
        '_c' + str(args.correlation) +\
        '_e' + str(args.epsilon) +\
        '_m' + str(args.matrix) +\
        '_t' + str(args.time) +\
        '_w' + str(args.window)
    if args.model == 'b':       # Basic
        # start_time = time.process_time()
        a_b = bpl_m(p_b, [0.0] * args.time, e, args.time)
        a_f = fpl_m(p_f, [0.0] * args.time, e, args.time)
        a = tpl(a_b, a_f, e)
        # duration1 = time.process_time() - start_time
        if DEBUG:
            print('\nBasic\n'
                  't\tbpl\t\tfpl\t\ttpl')
            for i in range(0, args.time):
                print('%d\t%f\t%f\t%f' %(i + 1, a_b[i], a_f[i], a[i]))
            print('\nSUM\t%f\t%f\t%f' %(sum(a_b), sum(a_f), sum(a)))
            print('AVG\t%f\t%f\t%f' %(sum(a_b)/args.time, sum(a_f)/args.time, sum(a)/args.time))
            plot_loss('Basic', e, a_b, a_f, a)
        # Save only when an output path is given. (The original guard
        # referenced the nonexistent attribute 'args.Returns'.)
        if args.output:
            save_output(output_path, args.time, e, a_b, a_f, a)
        # start_time = time.process_time()
        # a_b_m = bpl_m(p_b, [0.0] * args.time, e, args.time)
        # a_f_m = fpl_m(p_f, [0.0] * args.time, e, args.time)
        # a_m = tpl(a_b_m, a_f_m, e)
        # duration2 = time.process_time() - start_time
        # diff = ((duration2 - duration1)/duration1)*100
        # success = safe_div((TOTAL - MISS)*100, TOTAL)
        # print('##############################\n'
        #       'basic : %.4fs\n'
        #       'w/ mem: %.4fs\n'
        #       '- succ: %d/%d (%.2f%%)\n'
        #       'diff  : %.4fs (%.2f%%)\n'
        #       'OK    : %r'
        #       %(duration1,
        #         duration2,
        #         TOTAL - MISS, TOTAL, success,
        #         duration2 - duration1, diff,
        #         np.array_equal(a, a_m)))
        # MISS, TOTAL = 0, 0
    elif args.model == 's':     # Static
        # start_time = time.process_time()
        a_s_b, a_s_f = [None]*args.time, [None]*args.time
        for i in range(1, args.time + 1):
            a_s_b[i - 1] = bpl_s_m(p_b, e, i, args.window)
            a_s_f[i - 1] = fpl_s_m(p_f, e, i, args.time, args.window)
        a_s = tpl(a_s_b, a_s_f, e)
        # duration1 = time.process_time() - start_time
        if DEBUG:
            print('\nStatic\n'
                  't\tbpl\t\t\tfpl\t\t\ttpl')
            for i in range(0, args.time):
                print('%d\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
                      %(i + 1,
                        a_s_b[i], (a_s_b[i] - a_b[i])/a_b[i]*100,
                        a_s_f[i], (a_s_f[i] - a_f[i])/a_f[i]*100,
                        a_s[i], (a_s[i] - a[i])/a[i]*100))
            print('\nSUM\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
                  %(sum(a_s_b), ((sum(a_s_b) - sum(a_b))/sum(a_b))*100,
                    sum(a_s_f), ((sum(a_s_f) - sum(a_f))/sum(a_f))*100,
                    sum(a_s), ((sum(a_s) - sum(a))/sum(a))*100))
            print('AVG\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
                  %(sum(a_s_b)/args.time,
                    ((sum(a_s_b)/args.time - sum(a_b)/args.time)/(sum(a_b)/args.time))*100,
                    sum(a_s_f)/args.time,
                    ((sum(a_s_f)/args.time - sum(a_f)/args.time)/(sum(a_f)/args.time))*100,
                    sum(a_s)/args.time,
                    ((sum(a_s)/args.time - sum(a)/args.time)/(sum(a)/args.time))*100))
            plot_loss('Static', e, a_s_b, a_s_f, a_s)
        if args.output:
            save_output(output_path, args.time, e, a_s_b, a_s_f, a_s)
        # start_time = time.process_time()
        # a_s_b_m, a_s_f_m = [None]*args.time, [None]*args.time
        # for i in range(1, args.time + 1):
        #     a_s_b_m[i - 1] = bpl_s_m(p_b, e, i, args.window)
        #     a_s_f_m[i - 1] = fpl_s_m(p_f, e, i, args.time, args.window)
        # a_s_m = tpl(a_s_b_m, a_s_f_m, e)
        # duration2 = time.process_time() - start_time
        # diff = ((duration2 - duration1)/duration1)*100
        # success = safe_div((TOTAL - MISS)*100, TOTAL)
        # print('##############################\n'
        #       'static: %.4fs\n'
        #       'w/ mem: %.4fs\n'
        #       '- succ: %d/%d (%.2f%%)\n'
        #       'diff  : %.4fs (%.2f%%)\n'
        #       'OK    : %r'
        #       %(duration1,
        #         duration2,
        #         TOTAL - MISS, TOTAL, success,
        #         duration2 - duration1, diff,
        #         np.array_equal(a_s, a_s_m)))
        # MISS, TOTAL = 0, 0
    elif args.model == 'l':     # Linear
        l = 1
        # start_time = time.process_time()
        a_l_b, a_l_f = [None]*args.time, [None]*args.time
        for i in range(1, args.time + 1):
            a_l_b[i - 1] = bpl_l_m(p_b, e, i, args.window, l)
            a_l_f[i - 1] = fpl_l_m(p_f, e, i, args.time, args.window, l)
        a_l = tpl(a_l_b, a_l_f, e)
        # duration1 = time.process_time() - start_time
        if DEBUG:
            print('\nLinear\n'
                  't\tbpl\t\t\tfpl\t\t\ttpl')
            for i in range(0, args.time):
                print('%d\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
                      %(i + 1,
                        a_l_b[i], (a_l_b[i] - a_b[i])/a_b[i]*100,
                        a_l_f[i], (a_l_f[i] - a_f[i])/a_f[i]*100,
                        a_l[i], (a_l[i] - a[i])/a[i]*100))
            print('\nSUM\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
                  %(sum(a_l_b), ((sum(a_l_b) - sum(a_b))/sum(a_b))*100,
                    sum(a_l_f), ((sum(a_l_f) - sum(a_f))/sum(a_f))*100,
                    sum(a_l), ((sum(a_l) - sum(a))/sum(a))*100))
            print('AVG\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
                  %(sum(a_l_b)/args.time,
                    ((sum(a_l_b)/args.time - sum(a_b)/args.time)/(sum(a_b)/args.time))*100,
                    sum(a_l_f)/args.time,
                    ((sum(a_l_f)/args.time - sum(a_f)/args.time)/(sum(a_f)/args.time))*100,
                    sum(a_l)/args.time,
                    ((sum(a_l)/args.time - sum(a)/args.time)/(sum(a)/args.time))*100))
            plot_loss('Linear', e, a_l_b, a_l_f, a_l)
        if args.output:
            save_output(output_path, args.time, e, a_l_b, a_l_f, a_l)
        # start_time = time.process_time()
        # a_l_b_m, a_l_f_m = [None]*args.time, [None]*args.time
        # for i in range(1, args.time + 1):
        #     a_l_b_m[i - 1] = bpl_l_m(p_b, e, i, args.window, l)
        #     a_l_f_m[i - 1] = fpl_l_m(p_f, e, i, args.time, args.window, l)
        # a_l_m = tpl(a_l_b_m, a_l_f_m, e)
        # duration2 = time.process_time() - start_time
        # diff = ((duration2 - duration1)/duration1)*100
        # success = safe_div((TOTAL - MISS)*100, TOTAL)
        # print('##############################\n'
        #       'linear: %.4fs\n'
        #       'w/ mem: %.4fs\n'
        #       '- succ: %d/%d (%.2f%%)\n'
        #       'diff  : %.4fs (%.2f%%)\n'
        #       'OK    : %r'
        #       %(duration1,
        #         duration2,
        #         TOTAL - MISS, TOTAL, success,
        #         duration2 - duration1, diff,
        #         np.array_equal(a_l, a_l_m)))
        # MISS, TOTAL = 0, 0
    elif args.model == 'e':     # Exponential
        h = 1
        # start_time = time.process_time()
        a_e_b, a_e_f = [None]*args.time, [None]*args.time
        for i in range(1, args.time + 1):
            a_e_b[i - 1] = bpl_e_m(p_b, e, i, args.window, h)
            a_e_f[i - 1] = fpl_e_m(p_f, e, i, args.time, args.window, h)
        a_e = tpl(a_e_b, a_e_f, e)
        # duration1 = time.process_time() - start_time
        if DEBUG:
            print('\nExponential\n'
                  't\tbpl\t\t\tfpl\t\t\ttpl')
            for i in range(0, args.time):
                print('%d\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
                      %(i + 1,
                        a_e_b[i], (a_e_b[i] - a_b[i])/a_b[i]*100,
                        a_e_f[i], (a_e_f[i] - a_f[i])/a_f[i]*100,
                        a_e[i], (a_e[i] - a[i])/a[i]*100))
            print('\nSUM\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
                  %(sum(a_e_b), ((sum(a_e_b) - sum(a_b))/sum(a_b))*100,
                    sum(a_e_f), ((sum(a_e_f) - sum(a_f))/sum(a_f))*100,
                    sum(a_e), ((sum(a_e) - sum(a))/sum(a))*100))
            print('AVG\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
                  %(sum(a_e_b)/args.time,
                    ((sum(a_e_b)/args.time - sum(a_b)/args.time)/(sum(a_b)/args.time))*100,
                    sum(a_e_f)/args.time,
                    ((sum(a_e_f)/args.time - sum(a_f)/args.time)/(sum(a_f)/args.time))*100,
                    sum(a_e)/args.time,
                    ((sum(a_e)/args.time - sum(a)/args.time)/(sum(a)/args.time))*100))
            plot_loss('Exponential', e, a_e_b, a_e_f, a_e)
        if args.output:
            save_output(output_path, args.time, e, a_e_b, a_e_f, a_e)
        # start_time = time.process_time()
        # a_e_b_m, a_e_f_m = [None]*args.time, [None]*args.time
        # for i in range(1, args.time + 1):
        #     a_e_b_m[i - 1] = bpl_e_m(p_b, e, i, args.window, h)
        #     a_e_f_m[i - 1] = fpl_e_m(p_f, e, i, args.time, args.window, h)
        # a_e_m = tpl(a_e_b_m, a_e_f_m, e)
        # duration2 = time.process_time() - start_time
        # diff = ((duration2 - duration1)/duration1)*100
        # success = safe_div((TOTAL - MISS)*100, TOTAL)
        # print('##############################\n'
        #       'exp   : %.4fs\n'
        #       'w/ mem: %.4fs\n'
        #       '- succ: %d/%d (%.2f%%)\n'
        #       'diff  : %.4fs (%.2f%%)\n'
        #       'OK    : %r'
        #       %(duration1,
        #         duration2,
        #         TOTAL - MISS, TOTAL, success,
        #         duration2 - duration1, diff,
        #         np.array_equal(a_e, a_e_m)))
        # MISS, TOTAL = 0, 0
    # if DEBUG:
    #     cmp_loss('Backward', a_b, a_s_b, a_l_b, a_e_b)
    #     cmp_loss('Forward', a_f, a_s_f, a_l_f, a_e_f)
    #     cmp_loss('Total', a, a_s, a_l, a_e)


if __name__ == '__main__':
    try:
        args = parse_args()
        print('##############################\n'
              'Correlation: %f\n'
              'Debug      : %r\n'
              'Epsilon    : %f\n'
              'Matrix     : %d\n'
              'Model      : %s\n'
              'Output     : %s\n'
              'Time       : %d\n'
              'Window     : %d\n'
              '##############################'
              %(args.correlation,
                args.debug,
                args.epsilon,
                args.matrix,
                args.model,
                args.output,
                args.time,
                args.window), flush=True)
        start_time = time.process_time()
        main(args)
        end_time = time.process_time()
        print('##############################\n'
              'Time : %.4fs\n'
              '##############################\n'
              %(end_time - start_time), flush=True)
    except KeyboardInterrupt:
        print('Interrupted by user.')
        exit()