diff --git a/code/lib/gdp.py b/code/lib/gdp.py index 8467e28..52da294 100644 --- a/code/lib/gdp.py +++ b/code/lib/gdp.py @@ -28,15 +28,15 @@ MISS = 0 # Number of additions to the cache. TOTAL = 0 # Number of cache accesses. -''' - Read data from a file. - - Parameters: - path - The relative path to the data file. - Returns: - data - A list of tuples [uid, timestamp, lng, lat, loc]. -''' def load_data(path): + ''' + Read data from a file. + + Parameters: + path - The relative path to the data file. + Returns: + data - A list of tuples [uid, timestamp, lng, lat, loc]. + ''' print('Loading data from', os.path.abspath(path), '... ', end='') data = [] try: @@ -50,20 +50,20 @@ def load_data(path): exit() -''' - Save output to a file. - - Parameters: - path - The relative path to the output file. - t - The number of timestamps. - e - The privacy budget at each timestamp. - a_b - The backward privacy loss at each timestamp. - a_f - The forward privacy loss at each timestamp. - a - The temporal privacy loss at each timestamp. - Returns: - Nothing. -''' def save_output(path, t, e, a_b, a_f, a): + ''' + Save output to a file. + + Parameters: + path - The relative path to the output file. + t - The number of timestamps. + e - The privacy budget at each timestamp. + a_b - The backward privacy loss at each timestamp. + a_f - The forward privacy loss at each timestamp. + a - The temporal privacy loss at each timestamp. + Returns: + Nothing. + ''' # timestamp = time.strftime('%Y%m%d%H%M%S') print('Saving output to %s... ' %(path), end='', flush=True) os.makedirs(os.path.dirname(path), exist_ok=True) @@ -74,15 +74,15 @@ def save_output(path, t, e, a_b, a_f, a): print('OK.', flush=True) -''' - Get all the timestamps from the input data. - - Parameters: - data - The input data set. - Returns: - timestamps - An ndarray of all of the timestamps from the input data. -''' def get_timestamps(data): + ''' + Get all the timestamps from the input data. + + Parameters: + data - The input data set. + Returns: + timestamps - An ndarray of all of the timestamps from the input data. + ''' print('Getting a list of all timestamps... ', end='', flush=True) timestamps = np.sort(np.unique(np.array(data)[:, 1])) if not len(timestamps): @@ -103,15 +103,15 @@ def get_timestamps(data): return timestamps -''' - Get all the unique locations from the input data. - - Parameters: - data - The input data set. - Returns: - locs - A sorted ndarray of all the unique locations int the input data. -''' def get_locs(data): + ''' + Get all the unique locations from the input data. + + Parameters: + data - The input data set. + Returns: + locs - A sorted ndarray of all the unique locations int the input data. + ''' print('Getting a list of all locations... ', end='', flush=True) locs = np.sort(np.unique(np.array(data)[:, 4].astype(np.int))) if not len(locs): @@ -123,16 +123,16 @@ def get_locs(data): return list(map(str, locs)) -''' - Get the counts at every location for a specific timestamp. - - Parameters: - data - The input data set. - t - The timestamp of interest. - Returns: - cnts - A dict {loc:cnt} with the counts at every location for a specific timestamp. -''' def get_cnts(data, t): + ''' + Get the counts at every location for a specific timestamp. + + Parameters: + data - The input data set. + t - The timestamp of interest. + Returns: + cnts - A dict {loc:cnt} with the counts at every location for a specific timestamp. + ''' print('Getting all counts at %s... ' %(t), end='', flush=True) locs = get_locs(data) cnts = dict.fromkeys(locs, 0) @@ -145,15 +145,15 @@ def get_cnts(data, t): return cnts -''' - Get the counts at every location for every timestamp. - - Parameters: - data - The input data set. - Returns: - cnts - A dict {timestamp:loc} with all the counts at every location for every timestamp. -''' def get_all_cnts(data): + ''' + Get the counts at every location for every timestamp. + + Parameters: + data - The input data set. + Returns: + cnts - A dict {timestamp:loc} with all the counts at every location for every timestamp. + ''' cnts = {} for d in data: key = d[1] + '@' + d[4] @@ -163,15 +163,15 @@ def get_all_cnts(data): return cnts -''' - Get a list of unique users in the input data set. - - Parameters: - data - The input data set. - Returns: - users - An ndarray of all unique users. -''' def get_usrs(data): + ''' + Get a list of unique users in the input data set. + + Parameters: + data - The input data set. + Returns: + users - An ndarray of all unique users. + ''' users = np.sort(np.unique(np.array(data)[:, 0].astype(np.int))) if not len(users): print('No users found.') @@ -181,16 +181,16 @@ def get_usrs(data): return users -''' - Get the data of a particular user from a data set. - - Parameters: - data - The input data set. - id - The user identifier. - Returns: - output - A list of the data of the targeted user. -''' def get_usr_data(data, id): + ''' + Get the data of a particular user from a data set. + + Parameters: + data - The input data set. + id - The user identifier. + Returns: + output - A list of the data of the targeted user. + ''' output = [] for d in data: if (d[0] == str(id)): @@ -200,30 +200,30 @@ def get_usr_data(data, id): return output -''' - Get the data of every user in a data set. - - Parameters: - data - The input data set. - Returns: - output - A dict {usr, [usr_data]} with the data of each user. -''' def get_usrs_data(data): + ''' + Get the data of every user in a data set. + + Parameters: + data - The input data set. + Returns: + output - A dict {usr, [usr_data]} with the data of each user. + ''' output = {} for d in data: output[d[0]] = output.get(d[0], []) + [d] return output -''' - Get the trajectory of a user from her data. - - Parameters: - data - The data of the user. - Returns: - traj - A list [(timestamp, loc)] with the locations and corresponding timestamps that the user was at. -''' def get_usr_traj(data): + ''' + Get the trajectory of a user from her data. + + Parameters: + data - The data of the user. + Returns: + traj - A list [(timestamp, loc)] with the locations and corresponding timestamps that the user was at. + ''' traj = [] for d in data: traj.append((d[1], d[4])) @@ -232,15 +232,15 @@ def get_usr_traj(data): return traj -''' - Get all the possible transitions. - - Parameters: - data - The input data set. - Returns: - trans - A set with all the possible forward transitions in the input. -''' def get_poss_trans(data): + ''' + Get all the possible transitions. + + Parameters: + data - The input data set. + Returns: + trans - A set with all the possible forward transitions in the input. + ''' print('Getting possible transitions... ', end='', flush=True) trans = set() for u, u_data in data.items(): @@ -253,16 +253,16 @@ def get_poss_trans(data): return trans -''' - Get all backward transitions in a data set. - - Parameters: - data - The input data set. - Returns: - trans - A dict {(t, t-1):[transitions]} with all the backward transitions - at every sequential timestamp pair in the input data set. -''' def get_bwd_trans(data): + ''' + Get all backward transitions in a data set. + + Parameters: + data - The input data set. + Returns: + trans - A dict {(t, t-1):[transitions]} with all the backward transitions + at every sequential timestamp pair in the input data set. + ''' print('Getting all backward transitions... ', end='', flush=True) trans = {} for u, u_data in data.items(): @@ -276,16 +276,16 @@ def get_bwd_trans(data): return trans -''' - Get all forward transitions in a data set. - - Parameters: - data - The input data set. - Returns: - trans - A dict {(t-1, t):[transitions]} with all the forward transitions - at every sequential timestamp pair in the input data set. -''' def get_fwd_trans(data): + ''' + Get all forward transitions in a data set. + + Parameters: + data - The input data set. + Returns: + trans - A dict {(t-1, t):[transitions]} with all the forward transitions + at every sequential timestamp pair in the input data set. + ''' print('Getting all forward transitions... ', end='', flush=True) trans = {} for u, u_data in data.items(): @@ -299,53 +299,53 @@ def get_fwd_trans(data): return trans -''' - Divide two numbers. If the divisor is 0 return inf. - - Parameters: - a - The dividend. - b - The divisor. - Returns: - The float result of the division. -''' def safe_div(a, b): + ''' + Divide two numbers. If the divisor is 0 return inf. + + Parameters: + a - The dividend. + b - The divisor. + Returns: + The float result of the division. + ''' if b == 0: return math.inf return float(a/b) -''' - Calculate the maximum value of the objective function. - - Parameters: - q - A row from the transition matrix. - d - Another row from the transition matrix. - a - The backward/forward privacy loss of the previous/next - timestamp. - Returns: - The maximum value of the objective function. -''' def max_val(q, d, a): + ''' + Calculate the maximum value of the objective function. + + Parameters: + q - A row from the transition matrix. + d - Another row from the transition matrix. + a - The backward/forward privacy loss of the previous/next + timestamp. + Returns: + The maximum value of the objective function. + ''' if a == math.inf: return math.nan return (q*(math.exp(a) - 1) + 1)/(d*(math.exp(a) - 1) + 1) -''' - Find two different rows (q and d) of a transition matrix (p) - that maximize the product of the objective function and return - their sums. - - Parameters: - p - The transition matrix representing the backward/forward - correlations. - a - The backward/forward privacy loss of the previous/next - timestamp. - Returns: - sum_q - The sum of the elements of q. - sum_d - The sum of the elements of d. -''' def find_qd(p, a): + ''' + Find two different rows (q and d) of a transition matrix (p) + that maximize the product of the objective function and return + their sums. + + Parameters: + p - The transition matrix representing the backward/forward + correlations. + a - The backward/forward privacy loss of the previous/next + timestamp. + Returns: + sum_q - The sum of the elements of q. + sum_d - The sum of the elements of d. + ''' res = 0.0 sum_q, sum_d = 0.0, 0.0 for q in p: # A row from the transition matrix. @@ -374,17 +374,17 @@ def find_qd(p, a): return sum_q, sum_d -''' - Generate data. - - Parameters: - usrs - The number of users. - timestamps - The number of timestamps. - locs - The numner of locations. - Returns: - data - The generated data. -''' def gen_data(usrs, timestamps, locs): + ''' + Generate data. + + Parameters: + usrs - The number of users. + timestamps - The number of timestamps. + locs - The numner of locations. + Returns: + data - The generated data. + ''' print('Generating data... ', end='', flush=True) # Generate timestamps. ts = [] @@ -412,18 +412,18 @@ def gen_data(usrs, timestamps, locs): return data -''' - Generate a transition matrix. - - Parameters: - n - The dimension of the matrix. - s - The correlation degree of each row [0, 1]. - The lower its value, the lower the degree of - uniformity of each row. - Returns: - p_ - The transition matrix. -''' def gen_trans_mt(n, s): + ''' + Generate a transition matrix. + + Parameters: + n - The dimension of the matrix. + s - The correlation degree of each row [0, 1]. + The lower its value, the lower the degree of + uniformity of each row. + Returns: + p_ - The transition matrix. + ''' if DEBUG: print('Generating transition matrix %dx%d with s = %.4f... ' %(n, n, s), end='', flush=True) p = np.zeros((n, n), float) @@ -439,17 +439,17 @@ def gen_trans_mt(n, s): return p_ -''' - Get the transition matrix - - Parameters: - locs - A list of all the locations. - trans - A list of all transitions. - Returns: - p - A 2d dict {{locs}{locs}} containing the - corresponding location transition probabilities. -''' def get_trans_mt(locs, trans): + ''' + Get the transition matrix + + Parameters: + locs - A list of all the locations. + trans - A list of all transitions. + Returns: + p - A 2d dict {{locs}{locs}} containing the + corresponding location transition probabilities. + ''' if DEBUG: print('Generating the transition matrix... ', end='', flush=True) # Initialize the transition matrix. @@ -476,16 +476,16 @@ def get_trans_mt(locs, trans): return p -''' - Calculate the measure-theoretic (Kolmogorov-Sinai) entropy - of a transition matrix. - - Parameters: - mt - A 2d dict transition matrix. - Returns: - h - The Kolmogorov-Sinai entropy of the matrix. -''' def get_entropy(mt): + ''' + Calculate the measure-theoretic (Kolmogorov-Sinai) entropy + of a transition matrix. + + Parameters: + mt - A 2d dict transition matrix. + Returns: + h - The Kolmogorov-Sinai entropy of the matrix. + ''' if DEBUG: print('Calculating the measure-theoretic entropy... ', end='', flush=True) h = 0.0 @@ -523,15 +523,15 @@ def get_entropy(mt): return h -''' - Convert a 2d dict to a 2d array. - - Parameters: - mt - The 2d dict. - Returns: - p - The 2d numpy array. -''' def get_2darray(mt): + ''' + Convert a 2d dict to a 2d array. + + Parameters: + mt - The 2d dict. + Returns: + p - The 2d numpy array. + ''' if type(mt) == type(np.array([])): return mt p = np.zeros((len(mt), len(mt)), float) @@ -540,51 +540,51 @@ def get_2darray(mt): return p -''' - Get a Laplace probability distribution. - - Parameters: - ts - The points of the distribution. - t - The location of the distribution. - sc - The scale of the distribution. - Returns: - The probability distribution. -''' def get_laplace_pd(ts, t, sc): + ''' + Get a Laplace probability distribution. + + Parameters: + ts - The points of the distribution. + t - The location of the distribution. + sc - The scale of the distribution. + Returns: + The probability distribution. + ''' x = np.arange(0, len(ts), 1) loc = np.where(ts == t) return laplace.pdf(x, loc=loc, scale=sc)[0] -''' - Get a Gaussian probability distribution. - - Parameters: - ts - The points of the distribution. - t - The location of the distribution. - sc - The scale of the distribution. - Returns: - The probability distribution. -''' def get_norm_pd(ts, t, sc): + ''' + Get a Gaussian probability distribution. + + Parameters: + ts - The points of the distribution. + t - The location of the distribution. + sc - The scale of the distribution. + Returns: + The probability distribution. + ''' x = np.arange(0, len(ts), 1) loc = np.where(ts == t) return norm.pdf(x, loc=loc, scale=sc)[0] -''' - Get a sample from the time series. - - Parameters: - ts - An ndarray of the timestamps. - t - The current timestamp. - pd - The probability distribution. - ptn - The desired portion [0, 1] of the non-zero elements - of the probability distribution to be sampled. - Returns: - spl - An ndarray of the sampled timestamps. -''' def get_sample(ts, t, pct, pd): + ''' + Get a sample from the time series. + + Parameters: + ts - An ndarray of the timestamps. + t - The current timestamp. + pd - The probability distribution. + ptn - The desired portion [0, 1] of the non-zero elements + of the probability distribution to be sampled. + Returns: + spl - An ndarray of the sampled timestamps. + ''' if DEBUG: print('Sampling %.2f%% of %s at %s... ' %(pct*100, ts, t), end='', flush=True) # Check that it is a valid timestamp. @@ -604,38 +604,38 @@ def get_sample(ts, t, pct, pd): return spl -''' - Calculate the backward/forward privacy loss at the current - timestamp. - - Parameters: - p - The transition matrix representing the backward/forward - temporal correlations. - a - The privacy loss of the previous/next timestamp. - e - The privacy budget for data publishing. - Returns: - The backward/forward privacy loss at the current - timestamp. -''' def priv_l(p, a, e): + ''' + Calculate the backward/forward privacy loss at the current + timestamp. + + Parameters: + p - The transition matrix representing the backward/forward + temporal correlations. + a - The privacy loss of the previous/next timestamp. + e - The privacy budget for data publishing. + Returns: + The backward/forward privacy loss at the current + timestamp. + ''' sum_q, sum_d = find_qd(p, a) return math.log(max_val(sum_q, sum_d, a)) + e -''' - Calculate the backward/forward privacy loss at the current - timestamp using memoization. - - Parameters: - p - The transition matrix representing the backward/forward - temporal correlations. - a - The privacy loss of the previous/next timestamp. - e - The privacy budget for data publishing. - Returns: - The backward/forward privacy loss at the current - timestamp. -''' def priv_l_m(p, a, e): + ''' + Calculate the backward/forward privacy loss at the current + timestamp using memoization. + + Parameters: + p - The transition matrix representing the backward/forward + temporal correlations. + a - The privacy loss of the previous/next timestamp. + e - The privacy budget for data publishing. + Returns: + The backward/forward privacy loss at the current + timestamp. + ''' key = xxhash.xxh64(p).hexdigest() + str(a) + str(e) global MEM, TOTAL, MISS TOTAL += 1 @@ -648,47 +648,48 @@ def priv_l_m(p, a, e): return result -''' - Calculate the total backward privacy loss at every timestamp. - - Parameters: - p - The transition matrix representing the backward - temporal correlations. - a - The backward privacy loss of every release. - e - The privacy budget for data publishing. - t - The time limit. - Returns: - a - The backward privacy loss at every timestamp - due to the previous data releases. -''' def bpl(p, a, e, t): + ''' + Calculate the total backward privacy loss at every timestamp. + + Parameters: + p - The transition matrix representing the backward + temporal correlations. + a - The backward privacy loss of every release. + e - The privacy budget for data publishing. + t - The time limit. + Returns: + a - The backward privacy loss at every timestamp + due to the previous data releases. + ''' a[0] = e[0] for i in range(1, t): a[i] = priv_l(p, a[i - 1], e[i]) return a -''' - Calculate the total backward privacy loss at the current - timestamp with memoization. - - Parameters: - p - The transition matrix representing the backward - temporal correlations. - a - The backward privacy loss of the current release - at all previous timestamps. - e - The privacy budget for data publishing. - t - The time limit. - Returns: - a - The backward privacy loss at the current timestamp - due to the previous data releases. -''' def bpl_m(p, a, e, t): + ''' + Calculate the total backward privacy loss at the current + timestamp with memoization. + + Parameters: + p - The transition matrix representing the backward + temporal correlations. + a - The backward privacy loss of the current release + at all previous timestamps. + e - The privacy budget for data publishing. + t - The time limit. + Returns: + a - The backward privacy loss at the current timestamp + due to the previous data releases. + ''' a[0] = e[0] for i in range(1, t): a[i] = priv_l_m(p, a[i - 1], e[i]) return a + def bpl_lmdk_mem(p, a, e, t, lmdk): # t is (near) the landmark if lmdk == t - 1 or t == lmdk: @@ -702,22 +703,22 @@ def bpl_lmdk_mem(p, a, e, t, lmdk): return a -''' - Calculate the total backward privacy loss at the current - timestamp using the static model, i.e., previous releases - are grouped in a window of static size. - - Parameters: - p - The transition matrix representing the backward - temporal correlations. - e - The privacy budget for data publishing. - i - The timestamp of interest. - w - The window size to group previous releases. - Returns: - a - The backward privacy loss at the current timestamp - due to the previous data releases. -''' def bpl_s(p, e, i, w): + ''' + Calculate the total backward privacy loss at the current + timestamp using the static model, i.e., previous releases + are grouped in a window of static size. + + Parameters: + p - The transition matrix representing the backward + temporal correlations. + e - The privacy budget for data publishing. + i - The timestamp of interest. + w - The window size to group previous releases. + Returns: + a - The backward privacy loss at the current timestamp + due to the previous data releases. + ''' if i - w > 1: # print('bpl_s: %d - %d [%d]' %(i, i - w, w)) return priv_l(np.linalg.matrix_power(p, w), bpl_s(p, e, i - w, w), e[i - 1]) @@ -729,22 +730,22 @@ def bpl_s(p, e, i, w): return e[0] -''' - Calculate the total backward privacy loss at the current - timestamp using the static model, i.e., previous releases - are grouped in a window of static size, using memoization. - - Parameters: - p - The transition matrix representing the backward - temporal correlations. - e - The privacy budget for data publishing. - i - The timestamp of interest. - w - The window size to group previous releases. - Returns: - a - The backward privacy loss at the current timestamp - due to the previous data releases. -''' def bpl_s_m(p, e, i, w): + ''' + Calculate the total backward privacy loss at the current + timestamp using the static model, i.e., previous releases + are grouped in a window of static size, using memoization. + + Parameters: + p - The transition matrix representing the backward + temporal correlations. + e - The privacy budget for data publishing. + i - The timestamp of interest. + w - The window size to group previous releases. + Returns: + a - The backward privacy loss at the current timestamp + due to the previous data releases. + ''' if i - w > 1: return priv_l_m(np.linalg.matrix_power(p, w), bpl_s_m(p, e, i - w, w), e[i - 1]) elif i - w <= 1: @@ -753,24 +754,24 @@ def bpl_s_m(p, e, i, w): return e[0] -''' - Calculate the total backward privacy loss at the current - timestamp using the linear model, i.e., previous releases - are grouped in a window of a size that increases linearly. - - Parameters: - p - The transition matrix representing the backward - temporal correlations. - e - The privacy budget for data publishing. - i - The timestamp of interest. - w - The window size to group previous releases. - l - The linearly increasing coefficient that affects the - window size. - Returns: - a - The backward privacy loss at the current timestamp - due to the previous data releases. -''' def bpl_l(p, e, i, w, l): + ''' + Calculate the total backward privacy loss at the current + timestamp using the linear model, i.e., previous releases + are grouped in a window of a size that increases linearly. + + Parameters: + p - The transition matrix representing the backward + temporal correlations. + e - The privacy budget for data publishing. + i - The timestamp of interest. + w - The window size to group previous releases. + l - The linearly increasing coefficient that affects the + window size. + Returns: + a - The backward privacy loss at the current timestamp + due to the previous data releases. + ''' if i - w*l > 1: # print('bpl_l: %d - %d [%d]' %(i, i - w*l, w*l)) return priv_l(np.linalg.matrix_power(p, w*l), bpl_l(p, e, i - w*l, w, l + 1), e[i - 1]) @@ -782,25 +783,25 @@ def bpl_l(p, e, i, w, l): return e[0] -''' - Calculate the total backward privacy loss at the current - timestamp using the linear model, i.e., previous releases - are grouped in a window of a size that increases linearly, - using memoization. - - Parameters: - p - The transition matrix representing the backward - temporal correlations. - e - The privacy budget for data publishing. - i - The timestamp of interest. - w - The window size to group previous releases. - l - The linearly increasing coefficient that affects the - window size. - Returns: - a - The backward privacy loss at the current timestamp - due to the previous data releases. -''' def bpl_l_m(p, e, i, w, l): + ''' + Calculate the total backward privacy loss at the current + timestamp using the linear model, i.e., previous releases + are grouped in a window of a size that increases linearly, + using memoization. + + Parameters: + p - The transition matrix representing the backward + temporal correlations. + e - The privacy budget for data publishing. + i - The timestamp of interest. + w - The window size to group previous releases. + l - The linearly increasing coefficient that affects the + window size. + Returns: + a - The backward privacy loss at the current timestamp + due to the previous data releases. + ''' if i - w*l > 1: return priv_l_m(np.linalg.matrix_power(p, w*l), bpl_l_m(p, e, i - w*l, w, l + 1), e[i - 1]) elif i - w*l <= 1: @@ -809,24 +810,24 @@ def bpl_l_m(p, e, i, w, l): return e[0] -''' - Calculate the total backward privacy loss at the current - timestamp using the exponential model, i.e., previous releases - are grouped in a window of a size that increases exponentially. - - Parameters: - p - The transition matrix representing the backward - temporal correlations. - e - The privacy budget for data publishing. - i - The timestamp of interest. - w - The window size to group previous releases. - h - The exponentially increasing coefficient that affects - the window size. - Returns: - a - The backward privacy loss at the current timestamp - due to the previous data releases. -''' def bpl_e(p, e, i, w, h): + ''' + Calculate the total backward privacy loss at the current + timestamp using the exponential model, i.e., previous releases + are grouped in a window of a size that increases exponentially. + + Parameters: + p - The transition matrix representing the backward + temporal correlations. + e - The privacy budget for data publishing. + i - The timestamp of interest. + w - The window size to group previous releases. + h - The exponentially increasing coefficient that affects + the window size. + Returns: + a - The backward privacy loss at the current timestamp + due to the previous data releases. + ''' if i - w**h > 1: # print('bpl_e: %d - %d [%d]' %(i, i - w**h, w**h)) return priv_l(np.linalg.matrix_power(p, w**h), bpl_e(p, e, i - w**h, w, h + 1), e[i - 1]) @@ -838,25 +839,25 @@ def bpl_e(p, e, i, w, h): return e[0] -''' - Calculate the total backward privacy loss at the current - timestamp using the exponential model, i.e., previous releases - are grouped in a window of a size that increases exponentially, - using memoization. - - Parameters: - p - The transition matrix representing the backward - temporal correlations. - e - The privacy budget for data publishing. - i - The timestamp of interest. - w - The window size to group previous releases. - h - The exponentially increasing coefficient that affects - the window size. - Returns: - a - The backward privacy loss at the current timestamp - due to the previous data releases. -''' def bpl_e_m(p, e, i, w, h): + ''' + Calculate the total backward privacy loss at the current + timestamp using the exponential model, i.e., previous releases + are grouped in a window of a size that increases exponentially, + using memoization. + + Parameters: + p - The transition matrix representing the backward + temporal correlations. + e - The privacy budget for data publishing. + i - The timestamp of interest. + w - The window size to group previous releases. + h - The exponentially increasing coefficient that affects + the window size. + Returns: + a - The backward privacy loss at the current timestamp + due to the previous data releases. + ''' if i - w**h > 1: return priv_l_m(np.linalg.matrix_power(p, w**h), bpl_e_m(p, e, i - w**h, w, h + 1), e[i - 1]) elif i - w**h <= 1: @@ -865,44 +866,44 @@ def bpl_e_m(p, e, i, w, h): return e[0] -''' - Calculate the total forward privacy loss at the current - timestamp. - - Parameters: - p - The transition matrix representing the forward - temporal correlations. - a - The forward privacy loss of the current release - at all next timestamps. - e - The privacy budget for data publishing. - t - The time limit. - Returns: - a - The forward privacy loss at the current timestamp - due to the next data releases. -''' def fpl(p, a, e, t): + ''' + Calculate the total forward privacy loss at the current + timestamp. + + Parameters: + p - The transition matrix representing the forward + temporal correlations. + a - The forward privacy loss of the current release + at all next timestamps. + e - The privacy budget for data publishing. + t - The time limit. + Returns: + a - The forward privacy loss at the current timestamp + due to the next data releases. + ''' a[t - 1] = e[t - 1] for i in range(t - 2, -1, -1): a[i] = priv_l(p, a[i + 1], e[i]) return a -''' - Calculate the total forward privacy loss at the current - timestamp, using memoization. - - Parameters: - p - The transition matrix representing the forward - temporal correlations. - a - The forward privacy loss of the current release - at all next timestamps. - e - The privacy budget for data publishing. - t - The time limit. - Returns: - a - The forward privacy loss at the current timestamp - due to the next data releases. -''' def fpl_m(p, a, e, t): + ''' + Calculate the total forward privacy loss at the current + timestamp, using memoization. + + Parameters: + p - The transition matrix representing the forward + temporal correlations. + a - The forward privacy loss of the current release + at all next timestamps. + e - The privacy budget for data publishing. + t - The time limit. + Returns: + a - The forward privacy loss at the current timestamp + due to the next data releases. + ''' a[t - 1] = e[t - 1] for i in range(t - 2, -1, -1): a[i] = priv_l_m(p, a[i + 1], e[i]) @@ -921,22 +922,22 @@ def fpl_lmdk_mem(p, a, e, t, lmdk): return a -''' - Calculate the total forward privacy loss at the current - timestamp using the static model, i.e., next releases - are grouped in a window of static size. - - Parameters: - p - The transition matrix representing the forward - temporal correlations. - e - The privacy budget for data publishing. - i - The timestamp of interest. - w - The window size to group next releases. - Returns: - a - The forward privacy loss at the current timestamp - due to the next data releases. -''' def fpl_s(p, e, i, t, w): + ''' + Calculate the total forward privacy loss at the current + timestamp using the static model, i.e., next releases + are grouped in a window of static size. + + Parameters: + p - The transition matrix representing the forward + temporal correlations. + e - The privacy budget for data publishing. + i - The timestamp of interest. + w - The window size to group next releases. + Returns: + a - The forward privacy loss at the current timestamp + due to the next data releases. + ''' if i + w < t: # print('fpl_s: %d - %d [%d]' %(i, i + w, w)) return priv_l(np.linalg.matrix_power(p, w), fpl_s(p, e, i + w, t, w), e[i - 1]) @@ -948,22 +949,22 @@ def fpl_s(p, e, i, t, w): return e[t - 1] -''' - Calculate the total forward privacy loss at the current - timestamp using the static model, i.e., next releases - are grouped in a window of static size, using memoization. - - Parameters: - p - The transition matrix representing the forward - temporal correlations. - e - The privacy budget for data publishing. - i - The timestamp of interest. - w - The window size to group next releases. - Returns: - a - The forward privacy loss at the current timestamp - due to the next data releases. -''' def fpl_s_m(p, e, i, t, w): + ''' + Calculate the total forward privacy loss at the current + timestamp using the static model, i.e., next releases + are grouped in a window of static size, using memoization. + + Parameters: + p - The transition matrix representing the forward + temporal correlations. + e - The privacy budget for data publishing. + i - The timestamp of interest. + w - The window size to group next releases. + Returns: + a - The forward privacy loss at the current timestamp + due to the next data releases. + ''' if i + w < t: return priv_l_m(np.linalg.matrix_power(p, w), fpl_s_m(p, e, i + w, t, w), e[i - 1]) elif i + w >= t: @@ -972,24 +973,24 @@ def fpl_s_m(p, e, i, t, w): return e[t - 1] -''' - Calculate the total forward privacy loss at the current - timestamp using the linear model, i.e., next releases - are grouped in a window of a size that increases linearly. - - Parameters: - p - The transition matrix representing the forward - temporal correlations. - e - The privacy budget for data publishing. - i - The timestamp of interest. - w - The window size to group next releases. - l - The linearly increasing coefficient that affects the - window size. - Returns: - a - The forward privacy loss at the current timestamp - due to the next data releases. -''' def fpl_l(p, e, i, t, w, l): + ''' + Calculate the total forward privacy loss at the current + timestamp using the linear model, i.e., next releases + are grouped in a window of a size that increases linearly. + + Parameters: + p - The transition matrix representing the forward + temporal correlations. + e - The privacy budget for data publishing. + i - The timestamp of interest. + w - The window size to group next releases. + l - The linearly increasing coefficient that affects the + window size. + Returns: + a - The forward privacy loss at the current timestamp + due to the next data releases. + ''' if i + w*l < t: # print('fpl_l: %d - %d [%d]' %(i, i + w*l, w*l)) return priv_l(np.linalg.matrix_power(p, w*l), fpl_l(p, e, i + w*l, t, w, l + 1), e[i - 1]) @@ -1001,25 +1002,25 @@ def fpl_l(p, e, i, t, w, l): return e[t - 1] -''' - Calculate the total forward privacy loss at the current - timestamp using the linear model, i.e., next releases - are grouped in a window of a size that increases linearly, - using memoization. - - Parameters: - p - The transition matrix representing the forward - temporal correlations. - e - The privacy budget for data publishing. - i - The timestamp of interest. - w - The window size to group next releases. - l - The linearly increasing coefficient that affects the - window size. - Returns: - a - The forward privacy loss at the current timestamp - due to the next data releases. -''' def fpl_l_m(p, e, i, t, w, l): + ''' + Calculate the total forward privacy loss at the current + timestamp using the linear model, i.e., next releases + are grouped in a window of a size that increases linearly, + using memoization. + + Parameters: + p - The transition matrix representing the forward + temporal correlations. + e - The privacy budget for data publishing. + i - The timestamp of interest. + w - The window size to group next releases. + l - The linearly increasing coefficient that affects the + window size. + Returns: + a - The forward privacy loss at the current timestamp + due to the next data releases. + ''' if i + w*l < t: return priv_l_m(np.linalg.matrix_power(p, w*l), fpl_l_m(p, e, i + w*l, t, w, l + 1), e[i - 1]) elif i + w*l >= t: @@ -1028,24 +1029,24 @@ def fpl_l_m(p, e, i, t, w, l): return e[t - 1] -''' - Calculate the total forward privacy loss at the current - timestamp using the exponential model, i.e., next releases - are grouped in a window of a size that increases exponentially. - - Parameters: - p - The transition matrix representing the forward - temporal correlations. - e - The privacy budget for data publishing. - i - The timestamp of interest. - w - The window size to group next releases. - h - The exponentially increasing coefficient that affects - the window size. - Returns: - a - The forward privacy loss at the current timestamp - due to the next data releases. -''' def fpl_e(p, e, i, t, w, h): + ''' + Calculate the total forward privacy loss at the current + timestamp using the exponential model, i.e., next releases + are grouped in a window of a size that increases exponentially. + + Parameters: + p - The transition matrix representing the forward + temporal correlations. + e - The privacy budget for data publishing. + i - The timestamp of interest. + w - The window size to group next releases. + h - The exponentially increasing coefficient that affects + the window size. + Returns: + a - The forward privacy loss at the current timestamp + due to the next data releases. + ''' if i + w**h < t: # print('fpl_e: %d - %d [%d]' %(i, i + w**h, w**h)) return priv_l(np.linalg.matrix_power(p, w**h), fpl_e(p, e, i + w**h, t, w, h + 1), e[i - 1]) @@ -1057,25 +1058,25 @@ def fpl_e(p, e, i, t, w, h): return e[t - 1] -''' - Calculate the total forward privacy loss at the current - timestamp using the exponential model, i.e., next releases - are grouped in a window of a size that increases exponentially, - using memoization. - - Parameters: - p - The transition matrix representing the forward - temporal correlations. - e - The privacy budget for data publishing. - i - The timestamp of interest. - w - The window size to group next releases. - h - The exponentially increasing coefficient that affects - the window size. - Returns: - a - The forward privacy loss at the current timestamp - due to the next data releases. -''' def fpl_e_m(p, e, i, t, w, h): + ''' + Calculate the total forward privacy loss at the current + timestamp using the exponential model, i.e., next releases + are grouped in a window of a size that increases exponentially, + using memoization. + + Parameters: + p - The transition matrix representing the forward + temporal correlations. + e - The privacy budget for data publishing. + i - The timestamp of interest. + w - The window size to group next releases. + h - The exponentially increasing coefficient that affects + the window size. + Returns: + a - The forward privacy loss at the current timestamp + due to the next data releases. + ''' if i + w**h < t: return priv_l_m(np.linalg.matrix_power(p, w**h), fpl_e_m(p, e, i + w**h, t, w, h + 1), e[i - 1]) elif i + w**h >= t: @@ -1084,41 +1085,41 @@ def fpl_e_m(p, e, i, t, w, h): return e[t - 1] -''' - Calculate the total privacy loss at every timestamp. - - Parameters: - bpl - The backward privacy loss. - fpl - The forward privacy loss. - e - The privacy budget for data publishing. - Returns: - The list of total privacy loss at every timestamp. -''' def tpl(bpl, fpl, e): + ''' + Calculate the total privacy loss at every timestamp. + + Parameters: + bpl - The backward privacy loss. + fpl - The forward privacy loss. + e - The privacy budget for data publishing. + Returns: + The list of total privacy loss at every timestamp. + ''' return [x + y - z for (x, y, z) in zip(bpl, fpl, e)] -''' - Calculate the temporal privacy loss at every timestamp - taking into account landmarks. - - Parameters: - e - The privacy budget for data publishing. - p_b - The transition matrix representing the backward - temporal correlations. - p_f - The transition matrix representing the forward - temporal correlations. - seq - The point sequence. - lmdks - The landmarks. - Returns: - a_b - The backward privacy loss at the current timestamp - due to the previous data releases. - a_f - The forward privacy loss at the current timestamp - due to the next data releases. - a - The total privacy loss at every timestamp - taking into account landmarks. -''' def tpl_lmdk_mem(e, p_b, p_f, seq, lmdks): + ''' + Calculate the temporal privacy loss at every timestamp + taking into account landmarks. + + Parameters: + e - The privacy budget for data publishing. + p_b - The transition matrix representing the backward + temporal correlations. + p_f - The transition matrix representing the forward + temporal correlations. + seq - The point sequence. + lmdks - The landmarks. + Returns: + a_b - The backward privacy loss at the current timestamp + due to the previous data releases. + a_f - The forward privacy loss at the current timestamp + due to the next data releases. + a - The total privacy loss at every timestamp + taking into account landmarks. + ''' a_b = np.zeros(len(seq)) a_f = np.zeros(len(seq)) a = np.zeros(len(seq)) @@ -1135,18 +1136,18 @@ def tpl_lmdk_mem(e, p_b, p_f, seq, lmdks): return a_b, a_f, a -''' - Get the limits for the calculation of temporal privacy loss. - - Parameters: - t - The current timestamp. - seq - The point sequence. - lmdks - The landmarks. - Returns: - t_prv - The previous landmark. - t_nxt - The next landmark. -''' def get_limits(t, seq, lmdks): + ''' + Get the limits for the calculation of temporal privacy loss. + + Parameters: + t - The current timestamp. + seq - The point sequence. + lmdks - The landmarks. + Returns: + t_prv - The previous landmark. + t_nxt - The next landmark. + ''' # Add landmark limits. seq_lmdks = np.copy(lmdks) # if seq[0] not in seq_lmdks: @@ -1174,19 +1175,19 @@ def get_limits(t, seq, lmdks): return t_prv, t_nxt -''' - Plots the privacy loss of the time series. - - Parameters: - title - The title of the plot. - e - The privacy budget for data publishing. - a_b - The backward privacy loss. - a_f - The forward privacy loss. - a - The total privacy loss. - Returns: - Nothing. -''' def plot_loss(title, e, a_b, a_f, a): + ''' + Plots the privacy loss of the time series. + + Parameters: + title - The title of the plot. + e - The privacy budget for data publishing. + a_b - The backward privacy loss. + a_f - The forward privacy loss. + a - The total privacy loss. + Returns: + Nothing. + ''' plt.rc('font', family='serif') plt.rc('font', size=10) plt.rc('text', usetex=True) @@ -1221,19 +1222,19 @@ def plot_loss(title, e, a_b, a_f, a): plt.show() -''' - Plots a comparison of the privacy loss of all models. - - Parameters: - title - The title of the plot. - a - The privacy loss of the basic model. - a_s - The privacy loss of the static model. - a_e - The privacy loss of the exponential model. - a_l - The privacy loss of the linear model. - Returns: - Nothing. -''' def cmp_loss(title, a, a_s, a_e, a_l): + ''' + Plots a comparison of the privacy loss of all models. + + Parameters: + title - The title of the plot. + a - The privacy loss of the basic model. + a_s - The privacy loss of the static model. + a_e - The privacy loss of the exponential model. + a_l - The privacy loss of the linear model. + Returns: + Nothing. + ''' plt.rc('font', family='serif') plt.rc('font', size=10) plt.rc('text', usetex=True) @@ -1268,23 +1269,23 @@ def cmp_loss(title, a, a_s, a_e, a_l): plt.show() -''' - Parse arguments. - - Mandatory: - model, The model to be used: (b)asic, - (s)tatic, (l)inear, (e)xponential. - - Optional: - -c, --correlation, The correlation degree. - -d, --debug, Enable debugging mode. - -e, --epsilon, The available privacy budget. - -m, --matrix, The size of the transition matrix. - -o, --output, The path to the output directory. - -t, --time, The time limit. - -w, --window, The size of the event protection window. -''' def parse_args(): + ''' + Parse arguments. + + Mandatory: + model, The model to be used: (b)asic, + (s)tatic, (l)inear, (e)xponential. + + Optional: + -c, --correlation, The correlation degree. + -d, --debug, Enable debugging mode. + -e, --epsilon, The available privacy budget. + -m, --matrix, The size of the transition matrix. + -o, --output, The path to the output directory. + -t, --time, The time limit. + -w, --window, The size of the event protection window. + ''' # Create argument parser. parser = argparse.ArgumentParser()