diff --git a/code/lib/gdp.py b/code/lib/gdp.py
index 8467e28..52da294 100644
--- a/code/lib/gdp.py
+++ b/code/lib/gdp.py
@@ -28,15 +28,15 @@ MISS = 0 # Number of additions to the cache.
 TOTAL = 0 # Number of cache accesses.
 
 
-'''
-    Read data from a file.
-
-    Parameters:
-        path - The relative path to the data file.
-    Returns:
-        data - A list of tuples [uid, timestamp, lng, lat, loc].
-'''
 def load_data(path):
+    '''
+    Read data from a file.
+
+    Parameters:
+        path - The relative path to the data file.
+    Returns:
+        data - A list of tuples [uid, timestamp, lng, lat, loc].
+    '''
     print('Loading data from', os.path.abspath(path), '... ', end='')
     data = []
     try:
@@ -50,20 +50,20 @@ def load_data(path):
         exit()
 
 
-'''
-    Save output to a file.
-
-    Parameters:
-        path - The relative path to the output file.
-        t - The number of timestamps.
-        e - The privacy budget at each timestamp.
-        a_b - The backward privacy loss at each timestamp.
-        a_f - The forward privacy loss at each timestamp.
-        a - The temporal privacy loss at each timestamp.
-    Returns:
-        Nothing.
-'''
 def save_output(path, t, e, a_b, a_f, a):
+    '''
+    Save output to a file.
+
+    Parameters:
+        path - The relative path to the output file.
+        t - The number of timestamps.
+        e - The privacy budget at each timestamp.
+        a_b - The backward privacy loss at each timestamp.
+        a_f - The forward privacy loss at each timestamp.
+        a - The temporal privacy loss at each timestamp.
+    Returns:
+        Nothing.
+    '''
     # timestamp = time.strftime('%Y%m%d%H%M%S')
     print('Saving output to %s... ' %(path), end='', flush=True)
     os.makedirs(os.path.dirname(path), exist_ok=True)
@@ -74,15 +74,15 @@ def save_output(path, t, e, a_b, a_f, a):
     print('OK.', flush=True)
 
 
-'''
-    Get all the timestamps from the input data.
-
-    Parameters:
-        data - The input data set.
-    Returns:
-        timestamps - An ndarray of all of the timestamps from the input data.
-'''
 def get_timestamps(data):
+    '''
+    Get all the timestamps from the input data.
+
+    Parameters:
+        data - The input data set.
+    Returns:
+        timestamps - An ndarray of all of the timestamps from the input data.
+    '''
     print('Getting a list of all timestamps... ', end='', flush=True)
     timestamps = np.sort(np.unique(np.array(data)[:, 1]))
     if not len(timestamps):
@@ -103,15 +103,15 @@
     return timestamps
 
 
-'''
-    Get all the unique locations from the input data.
-
-    Parameters:
-        data - The input data set.
-    Returns:
-        locs - A sorted ndarray of all the unique locations int the input data.
-'''
 def get_locs(data):
+    '''
+    Get all the unique locations from the input data.
+
+    Parameters:
+        data - The input data set.
+    Returns:
+        locs - A sorted ndarray of all the unique locations in the input data.
+    '''
     print('Getting a list of all locations... ', end='', flush=True)
     locs = np.sort(np.unique(np.array(data)[:, 4].astype(np.int)))
     if not len(locs):
@@ -123,16 +123,16 @@
     return list(map(str, locs))
 
 
-'''
-    Get the counts at every location for a specific timestamp.
-
-    Parameters:
-        data - The input data set.
-        t - The timestamp of interest.
-    Returns:
-        cnts - A dict {loc:cnt} with the counts at every location for a specific timestamp.
-'''
 def get_cnts(data, t):
+    '''
+    Get the counts at every location for a specific timestamp.
+
+    Parameters:
+        data - The input data set.
+        t - The timestamp of interest.
+    Returns:
+        cnts - A dict {loc:cnt} with the counts at every location for a specific timestamp.
+    '''
     print('Getting all counts at %s... 
' %(t), end='', flush=True) locs = get_locs(data) cnts = dict.fromkeys(locs, 0) @@ -145,15 +145,15 @@ def get_cnts(data, t): return cnts -''' - Get the counts at every location for every timestamp. - - Parameters: - data - The input data set. - Returns: - cnts - A dict {timestamp:loc} with all the counts at every location for every timestamp. -''' def get_all_cnts(data): + ''' + Get the counts at every location for every timestamp. + + Parameters: + data - The input data set. + Returns: + cnts - A dict {timestamp:loc} with all the counts at every location for every timestamp. + ''' cnts = {} for d in data: key = d[1] + '@' + d[4] @@ -163,15 +163,15 @@ def get_all_cnts(data): return cnts -''' - Get a list of unique users in the input data set. - - Parameters: - data - The input data set. - Returns: - users - An ndarray of all unique users. -''' def get_usrs(data): + ''' + Get a list of unique users in the input data set. + + Parameters: + data - The input data set. + Returns: + users - An ndarray of all unique users. + ''' users = np.sort(np.unique(np.array(data)[:, 0].astype(np.int))) if not len(users): print('No users found.') @@ -181,16 +181,16 @@ def get_usrs(data): return users -''' - Get the data of a particular user from a data set. - - Parameters: - data - The input data set. - id - The user identifier. - Returns: - output - A list of the data of the targeted user. -''' def get_usr_data(data, id): + ''' + Get the data of a particular user from a data set. + + Parameters: + data - The input data set. + id - The user identifier. + Returns: + output - A list of the data of the targeted user. + ''' output = [] for d in data: if (d[0] == str(id)): @@ -200,30 +200,30 @@ def get_usr_data(data, id): return output -''' - Get the data of every user in a data set. - - Parameters: - data - The input data set. - Returns: - output - A dict {usr, [usr_data]} with the data of each user. -''' def get_usrs_data(data): + ''' + Get the data of every user in a data set. + + Parameters: + data - The input data set. + Returns: + output - A dict {usr, [usr_data]} with the data of each user. + ''' output = {} for d in data: output[d[0]] = output.get(d[0], []) + [d] return output -''' - Get the trajectory of a user from her data. - - Parameters: - data - The data of the user. - Returns: - traj - A list [(timestamp, loc)] with the locations and corresponding timestamps that the user was at. -''' def get_usr_traj(data): + ''' + Get the trajectory of a user from her data. + + Parameters: + data - The data of the user. + Returns: + traj - A list [(timestamp, loc)] with the locations and corresponding timestamps that the user was at. + ''' traj = [] for d in data: traj.append((d[1], d[4])) @@ -232,15 +232,15 @@ def get_usr_traj(data): return traj -''' - Get all the possible transitions. - - Parameters: - data - The input data set. - Returns: - trans - A set with all the possible forward transitions in the input. -''' def get_poss_trans(data): + ''' + Get all the possible transitions. + + Parameters: + data - The input data set. + Returns: + trans - A set with all the possible forward transitions in the input. + ''' print('Getting possible transitions... ', end='', flush=True) trans = set() for u, u_data in data.items(): @@ -253,16 +253,16 @@ def get_poss_trans(data): return trans -''' - Get all backward transitions in a data set. - - Parameters: - data - The input data set. 
- Returns: - trans - A dict {(t, t-1):[transitions]} with all the backward transitions - at every sequential timestamp pair in the input data set. -''' def get_bwd_trans(data): + ''' + Get all backward transitions in a data set. + + Parameters: + data - The input data set. + Returns: + trans - A dict {(t, t-1):[transitions]} with all the backward transitions + at every sequential timestamp pair in the input data set. + ''' print('Getting all backward transitions... ', end='', flush=True) trans = {} for u, u_data in data.items(): @@ -276,16 +276,16 @@ def get_bwd_trans(data): return trans -''' - Get all forward transitions in a data set. - - Parameters: - data - The input data set. - Returns: - trans - A dict {(t-1, t):[transitions]} with all the forward transitions - at every sequential timestamp pair in the input data set. -''' def get_fwd_trans(data): + ''' + Get all forward transitions in a data set. + + Parameters: + data - The input data set. + Returns: + trans - A dict {(t-1, t):[transitions]} with all the forward transitions + at every sequential timestamp pair in the input data set. + ''' print('Getting all forward transitions... ', end='', flush=True) trans = {} for u, u_data in data.items(): @@ -299,53 +299,53 @@ def get_fwd_trans(data): return trans -''' - Divide two numbers. If the divisor is 0 return inf. - - Parameters: - a - The dividend. - b - The divisor. - Returns: - The float result of the division. -''' def safe_div(a, b): + ''' + Divide two numbers. If the divisor is 0 return inf. + + Parameters: + a - The dividend. + b - The divisor. + Returns: + The float result of the division. + ''' if b == 0: return math.inf return float(a/b) -''' - Calculate the maximum value of the objective function. - - Parameters: - q - A row from the transition matrix. - d - Another row from the transition matrix. - a - The backward/forward privacy loss of the previous/next - timestamp. - Returns: - The maximum value of the objective function. -''' def max_val(q, d, a): + ''' + Calculate the maximum value of the objective function. + + Parameters: + q - A row from the transition matrix. + d - Another row from the transition matrix. + a - The backward/forward privacy loss of the previous/next + timestamp. + Returns: + The maximum value of the objective function. + ''' if a == math.inf: return math.nan return (q*(math.exp(a) - 1) + 1)/(d*(math.exp(a) - 1) + 1) -''' - Find two different rows (q and d) of a transition matrix (p) - that maximize the product of the objective function and return - their sums. - - Parameters: - p - The transition matrix representing the backward/forward - correlations. - a - The backward/forward privacy loss of the previous/next - timestamp. - Returns: - sum_q - The sum of the elements of q. - sum_d - The sum of the elements of d. -''' def find_qd(p, a): + ''' + Find two different rows (q and d) of a transition matrix (p) + that maximize the product of the objective function and return + their sums. + + Parameters: + p - The transition matrix representing the backward/forward + correlations. + a - The backward/forward privacy loss of the previous/next + timestamp. + Returns: + sum_q - The sum of the elements of q. + sum_d - The sum of the elements of d. + ''' res = 0.0 sum_q, sum_d = 0.0, 0.0 for q in p: # A row from the transition matrix. @@ -374,17 +374,17 @@ def find_qd(p, a): return sum_q, sum_d -''' - Generate data. - - Parameters: - usrs - The number of users. - timestamps - The number of timestamps. - locs - The numner of locations. 
- Returns: - data - The generated data. -''' def gen_data(usrs, timestamps, locs): + ''' + Generate data. + + Parameters: + usrs - The number of users. + timestamps - The number of timestamps. + locs - The numner of locations. + Returns: + data - The generated data. + ''' print('Generating data... ', end='', flush=True) # Generate timestamps. ts = [] @@ -412,18 +412,18 @@ def gen_data(usrs, timestamps, locs): return data -''' - Generate a transition matrix. - - Parameters: - n - The dimension of the matrix. - s - The correlation degree of each row [0, 1]. - The lower its value, the lower the degree of - uniformity of each row. - Returns: - p_ - The transition matrix. -''' def gen_trans_mt(n, s): + ''' + Generate a transition matrix. + + Parameters: + n - The dimension of the matrix. + s - The correlation degree of each row [0, 1]. + The lower its value, the lower the degree of + uniformity of each row. + Returns: + p_ - The transition matrix. + ''' if DEBUG: print('Generating transition matrix %dx%d with s = %.4f... ' %(n, n, s), end='', flush=True) p = np.zeros((n, n), float) @@ -439,17 +439,17 @@ def gen_trans_mt(n, s): return p_ -''' - Get the transition matrix - - Parameters: - locs - A list of all the locations. - trans - A list of all transitions. - Returns: - p - A 2d dict {{locs}{locs}} containing the - corresponding location transition probabilities. -''' def get_trans_mt(locs, trans): + ''' + Get the transition matrix + + Parameters: + locs - A list of all the locations. + trans - A list of all transitions. + Returns: + p - A 2d dict {{locs}{locs}} containing the + corresponding location transition probabilities. + ''' if DEBUG: print('Generating the transition matrix... ', end='', flush=True) # Initialize the transition matrix. @@ -476,16 +476,16 @@ def get_trans_mt(locs, trans): return p -''' - Calculate the measure-theoretic (Kolmogorov-Sinai) entropy - of a transition matrix. - - Parameters: - mt - A 2d dict transition matrix. - Returns: - h - The Kolmogorov-Sinai entropy of the matrix. -''' def get_entropy(mt): + ''' + Calculate the measure-theoretic (Kolmogorov-Sinai) entropy + of a transition matrix. + + Parameters: + mt - A 2d dict transition matrix. + Returns: + h - The Kolmogorov-Sinai entropy of the matrix. + ''' if DEBUG: print('Calculating the measure-theoretic entropy... ', end='', flush=True) h = 0.0 @@ -523,15 +523,15 @@ def get_entropy(mt): return h -''' - Convert a 2d dict to a 2d array. - - Parameters: - mt - The 2d dict. - Returns: - p - The 2d numpy array. -''' def get_2darray(mt): + ''' + Convert a 2d dict to a 2d array. + + Parameters: + mt - The 2d dict. + Returns: + p - The 2d numpy array. + ''' if type(mt) == type(np.array([])): return mt p = np.zeros((len(mt), len(mt)), float) @@ -540,51 +540,51 @@ def get_2darray(mt): return p -''' - Get a Laplace probability distribution. - - Parameters: - ts - The points of the distribution. - t - The location of the distribution. - sc - The scale of the distribution. - Returns: - The probability distribution. -''' def get_laplace_pd(ts, t, sc): + ''' + Get a Laplace probability distribution. + + Parameters: + ts - The points of the distribution. + t - The location of the distribution. + sc - The scale of the distribution. + Returns: + The probability distribution. + ''' x = np.arange(0, len(ts), 1) loc = np.where(ts == t) return laplace.pdf(x, loc=loc, scale=sc)[0] -''' - Get a Gaussian probability distribution. - - Parameters: - ts - The points of the distribution. - t - The location of the distribution. 
- sc - The scale of the distribution. - Returns: - The probability distribution. -''' def get_norm_pd(ts, t, sc): + ''' + Get a Gaussian probability distribution. + + Parameters: + ts - The points of the distribution. + t - The location of the distribution. + sc - The scale of the distribution. + Returns: + The probability distribution. + ''' x = np.arange(0, len(ts), 1) loc = np.where(ts == t) return norm.pdf(x, loc=loc, scale=sc)[0] -''' - Get a sample from the time series. - - Parameters: - ts - An ndarray of the timestamps. - t - The current timestamp. - pd - The probability distribution. - ptn - The desired portion [0, 1] of the non-zero elements - of the probability distribution to be sampled. - Returns: - spl - An ndarray of the sampled timestamps. -''' def get_sample(ts, t, pct, pd): + ''' + Get a sample from the time series. + + Parameters: + ts - An ndarray of the timestamps. + t - The current timestamp. + pd - The probability distribution. + ptn - The desired portion [0, 1] of the non-zero elements + of the probability distribution to be sampled. + Returns: + spl - An ndarray of the sampled timestamps. + ''' if DEBUG: print('Sampling %.2f%% of %s at %s... ' %(pct*100, ts, t), end='', flush=True) # Check that it is a valid timestamp. @@ -604,38 +604,38 @@ def get_sample(ts, t, pct, pd): return spl -''' - Calculate the backward/forward privacy loss at the current - timestamp. - - Parameters: - p - The transition matrix representing the backward/forward - temporal correlations. - a - The privacy loss of the previous/next timestamp. - e - The privacy budget for data publishing. - Returns: - The backward/forward privacy loss at the current - timestamp. -''' def priv_l(p, a, e): + ''' + Calculate the backward/forward privacy loss at the current + timestamp. + + Parameters: + p - The transition matrix representing the backward/forward + temporal correlations. + a - The privacy loss of the previous/next timestamp. + e - The privacy budget for data publishing. + Returns: + The backward/forward privacy loss at the current + timestamp. + ''' sum_q, sum_d = find_qd(p, a) return math.log(max_val(sum_q, sum_d, a)) + e -''' - Calculate the backward/forward privacy loss at the current - timestamp using memoization. - - Parameters: - p - The transition matrix representing the backward/forward - temporal correlations. - a - The privacy loss of the previous/next timestamp. - e - The privacy budget for data publishing. - Returns: - The backward/forward privacy loss at the current - timestamp. -''' def priv_l_m(p, a, e): + ''' + Calculate the backward/forward privacy loss at the current + timestamp using memoization. + + Parameters: + p - The transition matrix representing the backward/forward + temporal correlations. + a - The privacy loss of the previous/next timestamp. + e - The privacy budget for data publishing. + Returns: + The backward/forward privacy loss at the current + timestamp. + ''' key = xxhash.xxh64(p).hexdigest() + str(a) + str(e) global MEM, TOTAL, MISS TOTAL += 1 @@ -648,47 +648,48 @@ def priv_l_m(p, a, e): return result -''' - Calculate the total backward privacy loss at every timestamp. - - Parameters: - p - The transition matrix representing the backward - temporal correlations. - a - The backward privacy loss of every release. - e - The privacy budget for data publishing. - t - The time limit. - Returns: - a - The backward privacy loss at every timestamp - due to the previous data releases. 
-''' def bpl(p, a, e, t): + ''' + Calculate the total backward privacy loss at every timestamp. + + Parameters: + p - The transition matrix representing the backward + temporal correlations. + a - The backward privacy loss of every release. + e - The privacy budget for data publishing. + t - The time limit. + Returns: + a - The backward privacy loss at every timestamp + due to the previous data releases. + ''' a[0] = e[0] for i in range(1, t): a[i] = priv_l(p, a[i - 1], e[i]) return a -''' - Calculate the total backward privacy loss at the current - timestamp with memoization. - - Parameters: - p - The transition matrix representing the backward - temporal correlations. - a - The backward privacy loss of the current release - at all previous timestamps. - e - The privacy budget for data publishing. - t - The time limit. - Returns: - a - The backward privacy loss at the current timestamp - due to the previous data releases. -''' def bpl_m(p, a, e, t): + ''' + Calculate the total backward privacy loss at the current + timestamp with memoization. + + Parameters: + p - The transition matrix representing the backward + temporal correlations. + a - The backward privacy loss of the current release + at all previous timestamps. + e - The privacy budget for data publishing. + t - The time limit. + Returns: + a - The backward privacy loss at the current timestamp + due to the previous data releases. + ''' a[0] = e[0] for i in range(1, t): a[i] = priv_l_m(p, a[i - 1], e[i]) return a + def bpl_lmdk_mem(p, a, e, t, lmdk): # t is (near) the landmark if lmdk == t - 1 or t == lmdk: @@ -702,22 +703,22 @@ def bpl_lmdk_mem(p, a, e, t, lmdk): return a -''' - Calculate the total backward privacy loss at the current - timestamp using the static model, i.e., previous releases - are grouped in a window of static size. - - Parameters: - p - The transition matrix representing the backward - temporal correlations. - e - The privacy budget for data publishing. - i - The timestamp of interest. - w - The window size to group previous releases. - Returns: - a - The backward privacy loss at the current timestamp - due to the previous data releases. -''' def bpl_s(p, e, i, w): + ''' + Calculate the total backward privacy loss at the current + timestamp using the static model, i.e., previous releases + are grouped in a window of static size. + + Parameters: + p - The transition matrix representing the backward + temporal correlations. + e - The privacy budget for data publishing. + i - The timestamp of interest. + w - The window size to group previous releases. + Returns: + a - The backward privacy loss at the current timestamp + due to the previous data releases. + ''' if i - w > 1: # print('bpl_s: %d - %d [%d]' %(i, i - w, w)) return priv_l(np.linalg.matrix_power(p, w), bpl_s(p, e, i - w, w), e[i - 1]) @@ -729,22 +730,22 @@ def bpl_s(p, e, i, w): return e[0] -''' - Calculate the total backward privacy loss at the current - timestamp using the static model, i.e., previous releases - are grouped in a window of static size, using memoization. - - Parameters: - p - The transition matrix representing the backward - temporal correlations. - e - The privacy budget for data publishing. - i - The timestamp of interest. - w - The window size to group previous releases. - Returns: - a - The backward privacy loss at the current timestamp - due to the previous data releases. 
-''' def bpl_s_m(p, e, i, w): + ''' + Calculate the total backward privacy loss at the current + timestamp using the static model, i.e., previous releases + are grouped in a window of static size, using memoization. + + Parameters: + p - The transition matrix representing the backward + temporal correlations. + e - The privacy budget for data publishing. + i - The timestamp of interest. + w - The window size to group previous releases. + Returns: + a - The backward privacy loss at the current timestamp + due to the previous data releases. + ''' if i - w > 1: return priv_l_m(np.linalg.matrix_power(p, w), bpl_s_m(p, e, i - w, w), e[i - 1]) elif i - w <= 1: @@ -753,24 +754,24 @@ def bpl_s_m(p, e, i, w): return e[0] -''' - Calculate the total backward privacy loss at the current - timestamp using the linear model, i.e., previous releases - are grouped in a window of a size that increases linearly. - - Parameters: - p - The transition matrix representing the backward - temporal correlations. - e - The privacy budget for data publishing. - i - The timestamp of interest. - w - The window size to group previous releases. - l - The linearly increasing coefficient that affects the - window size. - Returns: - a - The backward privacy loss at the current timestamp - due to the previous data releases. -''' def bpl_l(p, e, i, w, l): + ''' + Calculate the total backward privacy loss at the current + timestamp using the linear model, i.e., previous releases + are grouped in a window of a size that increases linearly. + + Parameters: + p - The transition matrix representing the backward + temporal correlations. + e - The privacy budget for data publishing. + i - The timestamp of interest. + w - The window size to group previous releases. + l - The linearly increasing coefficient that affects the + window size. + Returns: + a - The backward privacy loss at the current timestamp + due to the previous data releases. + ''' if i - w*l > 1: # print('bpl_l: %d - %d [%d]' %(i, i - w*l, w*l)) return priv_l(np.linalg.matrix_power(p, w*l), bpl_l(p, e, i - w*l, w, l + 1), e[i - 1]) @@ -782,25 +783,25 @@ def bpl_l(p, e, i, w, l): return e[0] -''' - Calculate the total backward privacy loss at the current - timestamp using the linear model, i.e., previous releases - are grouped in a window of a size that increases linearly, - using memoization. - - Parameters: - p - The transition matrix representing the backward - temporal correlations. - e - The privacy budget for data publishing. - i - The timestamp of interest. - w - The window size to group previous releases. - l - The linearly increasing coefficient that affects the - window size. - Returns: - a - The backward privacy loss at the current timestamp - due to the previous data releases. -''' def bpl_l_m(p, e, i, w, l): + ''' + Calculate the total backward privacy loss at the current + timestamp using the linear model, i.e., previous releases + are grouped in a window of a size that increases linearly, + using memoization. + + Parameters: + p - The transition matrix representing the backward + temporal correlations. + e - The privacy budget for data publishing. + i - The timestamp of interest. + w - The window size to group previous releases. + l - The linearly increasing coefficient that affects the + window size. + Returns: + a - The backward privacy loss at the current timestamp + due to the previous data releases. 
+ ''' if i - w*l > 1: return priv_l_m(np.linalg.matrix_power(p, w*l), bpl_l_m(p, e, i - w*l, w, l + 1), e[i - 1]) elif i - w*l <= 1: @@ -809,24 +810,24 @@ def bpl_l_m(p, e, i, w, l): return e[0] -''' - Calculate the total backward privacy loss at the current - timestamp using the exponential model, i.e., previous releases - are grouped in a window of a size that increases exponentially. - - Parameters: - p - The transition matrix representing the backward - temporal correlations. - e - The privacy budget for data publishing. - i - The timestamp of interest. - w - The window size to group previous releases. - h - The exponentially increasing coefficient that affects - the window size. - Returns: - a - The backward privacy loss at the current timestamp - due to the previous data releases. -''' def bpl_e(p, e, i, w, h): + ''' + Calculate the total backward privacy loss at the current + timestamp using the exponential model, i.e., previous releases + are grouped in a window of a size that increases exponentially. + + Parameters: + p - The transition matrix representing the backward + temporal correlations. + e - The privacy budget for data publishing. + i - The timestamp of interest. + w - The window size to group previous releases. + h - The exponentially increasing coefficient that affects + the window size. + Returns: + a - The backward privacy loss at the current timestamp + due to the previous data releases. + ''' if i - w**h > 1: # print('bpl_e: %d - %d [%d]' %(i, i - w**h, w**h)) return priv_l(np.linalg.matrix_power(p, w**h), bpl_e(p, e, i - w**h, w, h + 1), e[i - 1]) @@ -838,25 +839,25 @@ def bpl_e(p, e, i, w, h): return e[0] -''' - Calculate the total backward privacy loss at the current - timestamp using the exponential model, i.e., previous releases - are grouped in a window of a size that increases exponentially, - using memoization. - - Parameters: - p - The transition matrix representing the backward - temporal correlations. - e - The privacy budget for data publishing. - i - The timestamp of interest. - w - The window size to group previous releases. - h - The exponentially increasing coefficient that affects - the window size. - Returns: - a - The backward privacy loss at the current timestamp - due to the previous data releases. -''' def bpl_e_m(p, e, i, w, h): + ''' + Calculate the total backward privacy loss at the current + timestamp using the exponential model, i.e., previous releases + are grouped in a window of a size that increases exponentially, + using memoization. + + Parameters: + p - The transition matrix representing the backward + temporal correlations. + e - The privacy budget for data publishing. + i - The timestamp of interest. + w - The window size to group previous releases. + h - The exponentially increasing coefficient that affects + the window size. + Returns: + a - The backward privacy loss at the current timestamp + due to the previous data releases. + ''' if i - w**h > 1: return priv_l_m(np.linalg.matrix_power(p, w**h), bpl_e_m(p, e, i - w**h, w, h + 1), e[i - 1]) elif i - w**h <= 1: @@ -865,44 +866,44 @@ def bpl_e_m(p, e, i, w, h): return e[0] -''' - Calculate the total forward privacy loss at the current - timestamp. - - Parameters: - p - The transition matrix representing the forward - temporal correlations. - a - The forward privacy loss of the current release - at all next timestamps. - e - The privacy budget for data publishing. - t - The time limit. 
- Returns: - a - The forward privacy loss at the current timestamp - due to the next data releases. -''' def fpl(p, a, e, t): + ''' + Calculate the total forward privacy loss at the current + timestamp. + + Parameters: + p - The transition matrix representing the forward + temporal correlations. + a - The forward privacy loss of the current release + at all next timestamps. + e - The privacy budget for data publishing. + t - The time limit. + Returns: + a - The forward privacy loss at the current timestamp + due to the next data releases. + ''' a[t - 1] = e[t - 1] for i in range(t - 2, -1, -1): a[i] = priv_l(p, a[i + 1], e[i]) return a -''' - Calculate the total forward privacy loss at the current - timestamp, using memoization. - - Parameters: - p - The transition matrix representing the forward - temporal correlations. - a - The forward privacy loss of the current release - at all next timestamps. - e - The privacy budget for data publishing. - t - The time limit. - Returns: - a - The forward privacy loss at the current timestamp - due to the next data releases. -''' def fpl_m(p, a, e, t): + ''' + Calculate the total forward privacy loss at the current + timestamp, using memoization. + + Parameters: + p - The transition matrix representing the forward + temporal correlations. + a - The forward privacy loss of the current release + at all next timestamps. + e - The privacy budget for data publishing. + t - The time limit. + Returns: + a - The forward privacy loss at the current timestamp + due to the next data releases. + ''' a[t - 1] = e[t - 1] for i in range(t - 2, -1, -1): a[i] = priv_l_m(p, a[i + 1], e[i]) @@ -921,22 +922,22 @@ def fpl_lmdk_mem(p, a, e, t, lmdk): return a -''' - Calculate the total forward privacy loss at the current - timestamp using the static model, i.e., next releases - are grouped in a window of static size. - - Parameters: - p - The transition matrix representing the forward - temporal correlations. - e - The privacy budget for data publishing. - i - The timestamp of interest. - w - The window size to group next releases. - Returns: - a - The forward privacy loss at the current timestamp - due to the next data releases. -''' def fpl_s(p, e, i, t, w): + ''' + Calculate the total forward privacy loss at the current + timestamp using the static model, i.e., next releases + are grouped in a window of static size. + + Parameters: + p - The transition matrix representing the forward + temporal correlations. + e - The privacy budget for data publishing. + i - The timestamp of interest. + w - The window size to group next releases. + Returns: + a - The forward privacy loss at the current timestamp + due to the next data releases. + ''' if i + w < t: # print('fpl_s: %d - %d [%d]' %(i, i + w, w)) return priv_l(np.linalg.matrix_power(p, w), fpl_s(p, e, i + w, t, w), e[i - 1]) @@ -948,22 +949,22 @@ def fpl_s(p, e, i, t, w): return e[t - 1] -''' - Calculate the total forward privacy loss at the current - timestamp using the static model, i.e., next releases - are grouped in a window of static size, using memoization. - - Parameters: - p - The transition matrix representing the forward - temporal correlations. - e - The privacy budget for data publishing. - i - The timestamp of interest. - w - The window size to group next releases. - Returns: - a - The forward privacy loss at the current timestamp - due to the next data releases. 
-''' def fpl_s_m(p, e, i, t, w): + ''' + Calculate the total forward privacy loss at the current + timestamp using the static model, i.e., next releases + are grouped in a window of static size, using memoization. + + Parameters: + p - The transition matrix representing the forward + temporal correlations. + e - The privacy budget for data publishing. + i - The timestamp of interest. + w - The window size to group next releases. + Returns: + a - The forward privacy loss at the current timestamp + due to the next data releases. + ''' if i + w < t: return priv_l_m(np.linalg.matrix_power(p, w), fpl_s_m(p, e, i + w, t, w), e[i - 1]) elif i + w >= t: @@ -972,24 +973,24 @@ def fpl_s_m(p, e, i, t, w): return e[t - 1] -''' - Calculate the total forward privacy loss at the current - timestamp using the linear model, i.e., next releases - are grouped in a window of a size that increases linearly. - - Parameters: - p - The transition matrix representing the forward - temporal correlations. - e - The privacy budget for data publishing. - i - The timestamp of interest. - w - The window size to group next releases. - l - The linearly increasing coefficient that affects the - window size. - Returns: - a - The forward privacy loss at the current timestamp - due to the next data releases. -''' def fpl_l(p, e, i, t, w, l): + ''' + Calculate the total forward privacy loss at the current + timestamp using the linear model, i.e., next releases + are grouped in a window of a size that increases linearly. + + Parameters: + p - The transition matrix representing the forward + temporal correlations. + e - The privacy budget for data publishing. + i - The timestamp of interest. + w - The window size to group next releases. + l - The linearly increasing coefficient that affects the + window size. + Returns: + a - The forward privacy loss at the current timestamp + due to the next data releases. + ''' if i + w*l < t: # print('fpl_l: %d - %d [%d]' %(i, i + w*l, w*l)) return priv_l(np.linalg.matrix_power(p, w*l), fpl_l(p, e, i + w*l, t, w, l + 1), e[i - 1]) @@ -1001,25 +1002,25 @@ def fpl_l(p, e, i, t, w, l): return e[t - 1] -''' - Calculate the total forward privacy loss at the current - timestamp using the linear model, i.e., next releases - are grouped in a window of a size that increases linearly, - using memoization. - - Parameters: - p - The transition matrix representing the forward - temporal correlations. - e - The privacy budget for data publishing. - i - The timestamp of interest. - w - The window size to group next releases. - l - The linearly increasing coefficient that affects the - window size. - Returns: - a - The forward privacy loss at the current timestamp - due to the next data releases. -''' def fpl_l_m(p, e, i, t, w, l): + ''' + Calculate the total forward privacy loss at the current + timestamp using the linear model, i.e., next releases + are grouped in a window of a size that increases linearly, + using memoization. + + Parameters: + p - The transition matrix representing the forward + temporal correlations. + e - The privacy budget for data publishing. + i - The timestamp of interest. + w - The window size to group next releases. + l - The linearly increasing coefficient that affects the + window size. + Returns: + a - The forward privacy loss at the current timestamp + due to the next data releases. 
+ ''' if i + w*l < t: return priv_l_m(np.linalg.matrix_power(p, w*l), fpl_l_m(p, e, i + w*l, t, w, l + 1), e[i - 1]) elif i + w*l >= t: @@ -1028,24 +1029,24 @@ def fpl_l_m(p, e, i, t, w, l): return e[t - 1] -''' - Calculate the total forward privacy loss at the current - timestamp using the exponential model, i.e., next releases - are grouped in a window of a size that increases exponentially. - - Parameters: - p - The transition matrix representing the forward - temporal correlations. - e - The privacy budget for data publishing. - i - The timestamp of interest. - w - The window size to group next releases. - h - The exponentially increasing coefficient that affects - the window size. - Returns: - a - The forward privacy loss at the current timestamp - due to the next data releases. -''' def fpl_e(p, e, i, t, w, h): + ''' + Calculate the total forward privacy loss at the current + timestamp using the exponential model, i.e., next releases + are grouped in a window of a size that increases exponentially. + + Parameters: + p - The transition matrix representing the forward + temporal correlations. + e - The privacy budget for data publishing. + i - The timestamp of interest. + w - The window size to group next releases. + h - The exponentially increasing coefficient that affects + the window size. + Returns: + a - The forward privacy loss at the current timestamp + due to the next data releases. + ''' if i + w**h < t: # print('fpl_e: %d - %d [%d]' %(i, i + w**h, w**h)) return priv_l(np.linalg.matrix_power(p, w**h), fpl_e(p, e, i + w**h, t, w, h + 1), e[i - 1]) @@ -1057,25 +1058,25 @@ def fpl_e(p, e, i, t, w, h): return e[t - 1] -''' - Calculate the total forward privacy loss at the current - timestamp using the exponential model, i.e., next releases - are grouped in a window of a size that increases exponentially, - using memoization. - - Parameters: - p - The transition matrix representing the forward - temporal correlations. - e - The privacy budget for data publishing. - i - The timestamp of interest. - w - The window size to group next releases. - h - The exponentially increasing coefficient that affects - the window size. - Returns: - a - The forward privacy loss at the current timestamp - due to the next data releases. -''' def fpl_e_m(p, e, i, t, w, h): + ''' + Calculate the total forward privacy loss at the current + timestamp using the exponential model, i.e., next releases + are grouped in a window of a size that increases exponentially, + using memoization. + + Parameters: + p - The transition matrix representing the forward + temporal correlations. + e - The privacy budget for data publishing. + i - The timestamp of interest. + w - The window size to group next releases. + h - The exponentially increasing coefficient that affects + the window size. + Returns: + a - The forward privacy loss at the current timestamp + due to the next data releases. + ''' if i + w**h < t: return priv_l_m(np.linalg.matrix_power(p, w**h), fpl_e_m(p, e, i + w**h, t, w, h + 1), e[i - 1]) elif i + w**h >= t: @@ -1084,41 +1085,41 @@ def fpl_e_m(p, e, i, t, w, h): return e[t - 1] -''' - Calculate the total privacy loss at every timestamp. - - Parameters: - bpl - The backward privacy loss. - fpl - The forward privacy loss. - e - The privacy budget for data publishing. - Returns: - The list of total privacy loss at every timestamp. -''' def tpl(bpl, fpl, e): + ''' + Calculate the total privacy loss at every timestamp. + + Parameters: + bpl - The backward privacy loss. + fpl - The forward privacy loss. 
+ e - The privacy budget for data publishing. + Returns: + The list of total privacy loss at every timestamp. + ''' return [x + y - z for (x, y, z) in zip(bpl, fpl, e)] -''' - Calculate the temporal privacy loss at every timestamp - taking into account landmarks. - - Parameters: - e - The privacy budget for data publishing. - p_b - The transition matrix representing the backward - temporal correlations. - p_f - The transition matrix representing the forward - temporal correlations. - seq - The point sequence. - lmdks - The landmarks. - Returns: - a_b - The backward privacy loss at the current timestamp - due to the previous data releases. - a_f - The forward privacy loss at the current timestamp - due to the next data releases. - a - The total privacy loss at every timestamp - taking into account landmarks. -''' def tpl_lmdk_mem(e, p_b, p_f, seq, lmdks): + ''' + Calculate the temporal privacy loss at every timestamp + taking into account landmarks. + + Parameters: + e - The privacy budget for data publishing. + p_b - The transition matrix representing the backward + temporal correlations. + p_f - The transition matrix representing the forward + temporal correlations. + seq - The point sequence. + lmdks - The landmarks. + Returns: + a_b - The backward privacy loss at the current timestamp + due to the previous data releases. + a_f - The forward privacy loss at the current timestamp + due to the next data releases. + a - The total privacy loss at every timestamp + taking into account landmarks. + ''' a_b = np.zeros(len(seq)) a_f = np.zeros(len(seq)) a = np.zeros(len(seq)) @@ -1135,18 +1136,18 @@ def tpl_lmdk_mem(e, p_b, p_f, seq, lmdks): return a_b, a_f, a -''' - Get the limits for the calculation of temporal privacy loss. - - Parameters: - t - The current timestamp. - seq - The point sequence. - lmdks - The landmarks. - Returns: - t_prv - The previous landmark. - t_nxt - The next landmark. -''' def get_limits(t, seq, lmdks): + ''' + Get the limits for the calculation of temporal privacy loss. + + Parameters: + t - The current timestamp. + seq - The point sequence. + lmdks - The landmarks. + Returns: + t_prv - The previous landmark. + t_nxt - The next landmark. + ''' # Add landmark limits. seq_lmdks = np.copy(lmdks) # if seq[0] not in seq_lmdks: @@ -1174,19 +1175,19 @@ def get_limits(t, seq, lmdks): return t_prv, t_nxt -''' - Plots the privacy loss of the time series. - - Parameters: - title - The title of the plot. - e - The privacy budget for data publishing. - a_b - The backward privacy loss. - a_f - The forward privacy loss. - a - The total privacy loss. - Returns: - Nothing. -''' def plot_loss(title, e, a_b, a_f, a): + ''' + Plots the privacy loss of the time series. + + Parameters: + title - The title of the plot. + e - The privacy budget for data publishing. + a_b - The backward privacy loss. + a_f - The forward privacy loss. + a - The total privacy loss. + Returns: + Nothing. + ''' plt.rc('font', family='serif') plt.rc('font', size=10) plt.rc('text', usetex=True) @@ -1221,19 +1222,19 @@ def plot_loss(title, e, a_b, a_f, a): plt.show() -''' - Plots a comparison of the privacy loss of all models. - - Parameters: - title - The title of the plot. - a - The privacy loss of the basic model. - a_s - The privacy loss of the static model. - a_e - The privacy loss of the exponential model. - a_l - The privacy loss of the linear model. - Returns: - Nothing. -''' def cmp_loss(title, a, a_s, a_e, a_l): + ''' + Plots a comparison of the privacy loss of all models. 
+
+    Parameters:
+        title - The title of the plot.
+        a - The privacy loss of the basic model.
+        a_s - The privacy loss of the static model.
+        a_e - The privacy loss of the exponential model.
+        a_l - The privacy loss of the linear model.
+    Returns:
+        Nothing.
+    '''
     plt.rc('font', family='serif')
     plt.rc('font', size=10)
     plt.rc('text', usetex=True)
@@ -1268,23 +1269,23 @@ plt.show()
 
 
-'''
-    Parse arguments.
-
-    Mandatory:
-        model,              The model to be used: (b)asic,
-                            (s)tatic, (l)inear, (e)xponential.
-
-    Optional:
-        -c, --correlation,  The correlation degree.
-        -d, --debug,        Enable debugging mode.
-        -e, --epsilon,      The available privacy budget.
-        -m, --matrix,       The size of the transition matrix.
-        -o, --output,       The path to the output directory.
-        -t, --time,         The time limit.
-        -w, --window,       The size of the event protection window.
-'''
 def parse_args():
+    '''
+    Parse arguments.
+
+    Mandatory:
+        model,              The model to be used: (b)asic,
+                            (s)tatic, (l)inear, (e)xponential.
+
+    Optional:
+        -c, --correlation,  The correlation degree.
+        -d, --debug,        Enable debugging mode.
+        -e, --epsilon,      The available privacy budget.
+        -m, --matrix,       The size of the transition matrix.
+        -o, --output,       The path to the output directory.
+        -t, --time,         The time limit.
+        -w, --window,       The size of the event protection window.
+    '''
     # Create argument parser.
     parser = argparse.ArgumentParser()
 
diff --git a/text/evaluation/summary.tex b/text/evaluation/summary.tex
index fa58627..8899e25 100644
--- a/text/evaluation/summary.tex
+++ b/text/evaluation/summary.tex
@@ -7,4 +7,4 @@ The {\thething} selection module introduces a reasonable data utility decline to
 % \kat{it would be nice to see it clearly on Figure 5.5. (eg, by including another bar that shows adaptive without landmark selection)}
 % \mk{Done.}
 In terms of temporal correlation, we observe that under moderate and strong temporal correlation, a greater average regular--{\thething} event distance in a {\thething} distribution causes greater overall privacy loss.
-Finally, the contribution of the {\thething} privacy on enhancing the data utility, while preserving $\epsilon$-differential privacy, is demonstrated by the fact that the selected Adaptive scheme provides better data utility than the user-level privacy protection.
+Finally, the contribution of {\thething} privacy to enhancing the data utility, while preserving $\varepsilon$-differential privacy, is demonstrated by the fact that the selected Adaptive scheme provides better data utility than the user-level privacy protection.
diff --git a/text/problem/thething/main.tex b/text/problem/thething/main.tex
index e768fd0..2136c23 100644
--- a/text/problem/thething/main.tex
+++ b/text/problem/thething/main.tex
@@ -22,7 +22,7 @@ Take for example the scenario in Figure~\ref{fig:st-cont}, where {\thethings} ar
 If we want to protect the {\thething} points, we have to allocate at most a budget of $\varepsilon$ to the {\thethings}, while saving some for the release of regular events.
 Essentially, the more budget we allocate to an event the less we protect it, but at the same time we maintain its utility.
 With {\thething} privacy we propose to distribute the budget taking into account only the existence of the {\thethings} when we release an event of the time series, i.e.,~allocating $\frac{\varepsilon}{5}$ ($4\ \text{\thethings} + 1\ \text{regular point}$) to each event (see Figure~\ref{fig:st-cont}).
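Editor's aside (not part of the patch): the allocation in the line above is simply the available budget split over the {\thethings} plus the one regular event being released, i.e., $\varepsilon/(|L|+1)$ per event. A minimal Python sketch of that arithmetic, using a hypothetical helper name (gdp.py has no such function):

    def uniform_lmdk_budget(eps, n_lmdks):
        # One equal share per landmark, plus one for the regular event being released.
        return eps / (n_lmdks + 1)

    per_event = uniform_lmdk_budget(1.0, 4)   # 4 landmarks + 1 regular point -> eps/5 each
    lmdk_total = 4 * per_event                # 4*eps/5 < eps, so the landmarks stay protected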
 
-This way, we still guarantee\footnote{$\epsilon$-differential privacy guarantees that the allocated budget should be less or equal to $\epsilon$, and not precisely how much.\kat{Mano check.}} that the {\thethings} are adequately protected, as they receive a total budget of $\frac{4\varepsilon}{5}<\varepsilon$.
+This way, we still guarantee\footnote{$\varepsilon$-differential privacy guarantees that the allocated budget should be less than or equal to $\varepsilon$, and not precisely how much.\kat{Mano check.}} that the {\thethings} are adequately protected, as they receive a total budget of $\frac{4\varepsilon}{5}<\varepsilon$.
 At the same time, we avoid over-perturbing the regular events, as we allocate to them a higher total budget ($\frac{4\varepsilon}{5}$) compared to the user-level scenario ($\frac{\varepsilon}{2}$), and thus less noise.
 
diff --git a/text/problem/thething/solution.tex b/text/problem/thething/solution.tex
index cbe7259..3c4577c 100644
--- a/text/problem/thething/solution.tex
+++ b/text/problem/thething/solution.tex
@@ -77,7 +77,7 @@ Intuitively, knowing the data set at timestamp $t$ stops the propagation of the
 %\kat{do we see this in the formula 1 ?}
 %when calculating the forward or backward privacy loss respectively.
 
-Cao et al.~\cite{cao2017quantifying} propose a method for computing the total temporal privacy loss $\alpha_t$ at a timestamp $t$ as the sum of the backward and forward privacy loss, $\alpha^B_t$ and $\alpha^F_t$, minus the privacy budget $\varepsilon_t$
+Cao et al.~\cite{cao2017quantifying} propose a method for computing the temporal privacy loss $\alpha_t$ at a timestamp $t$ as the sum of the backward and forward privacy loss, $\alpha^B_t$ and $\alpha^F_t$, minus the privacy budget $\varepsilon_t$
 to account for the extra privacy loss due to previous and next releases $\pmb{o}$ of $\mathcal{M}$ under temporal correlation.
 By Theorem~\ref{theor:thething-prv}, at every timestamp $t$ we consider the data at $t$ and at the {\thething} timestamps $L$.
 %According to the Definitions~{\ref{def:bpl} and \ref{def:fpl}}, we calculate the backward and forward privacy loss by taking into account the privacy budget at previous and next data releases respectively.
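Editor's aside (not part of the patch): the composition $\alpha_t = \alpha^B_t + \alpha^F_t - \varepsilon_t$ described above is exactly what tpl() in code/lib/gdp.py computes, with bpl() and fpl() supplying the backward and forward losses. A minimal usage sketch, assuming the gdp.py helpers shown in the hunks above are in scope and using made-up toy values:

    t = 5                                 # number of releases (toy value)
    e = [0.1] * t                         # per-timestamp privacy budget (toy values)
    p = gen_trans_mt(2, 0.5)              # small random transition matrix (function shown above)

    a_b = bpl(p, [0.0] * t, e, t)         # backward privacy loss per timestamp
    a_f = fpl(p, [0.0] * t, e, t)         # forward privacy loss per timestamp
    a = tpl(a_b, a_f, e)                  # alpha_t = alpha^B_t + alpha^F_t - epsilon_t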