Merge branch 'master' of git.delkappa.com:manos/the-last-thing

Manos Katsomallos 2021-10-19 03:44:04 +02:00
commit b93328bcea
4 changed files with 596 additions and 595 deletions


@@ -28,15 +28,15 @@ MISS = 0 # Number of additions to the cache.
TOTAL = 0 # Number of cache accesses.
def load_data(path):
'''
Read data from a file.
Parameters:
path - The relative path to the data file.
Returns:
data - A list of tuples [uid, timestamp, lng, lat, loc].
'''
print('Loading data from', os.path.abspath(path), '... ', end='')
data = []
try:
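For reference, each record in the returned list follows the [uid, timestamp, lng, lat, loc] layout described in the docstring, with every field read in as a string; a purely hypothetical row would be ['42', '2009-10-13 09:21:05', '116.31', '39.98', '7'].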
@@ -50,7 +50,8 @@ def load_data(path):
exit()
def save_output(path, t, e, a_b, a_f, a):
'''
Save output to a file.
Parameters:
@@ -62,8 +63,7 @@ def load_data(path):
a - The temporal privacy loss at each timestamp.
Returns:
Nothing.
'''
# timestamp = time.strftime('%Y%m%d%H%M%S')
print('Saving output to %s... ' %(path), end='', flush=True)
os.makedirs(os.path.dirname(path), exist_ok=True)
@@ -74,15 +74,15 @@ def save_output(path, t, e, a_b, a_f, a):
print('OK.', flush=True)
def get_timestamps(data):
'''
Get all the timestamps from the input data.
Parameters:
data - The input data set.
Returns:
timestamps - An ndarray of all of the timestamps from the input data.
'''
print('Getting a list of all timestamps... ', end='', flush=True)
timestamps = np.sort(np.unique(np.array(data)[:, 1]))
if not len(timestamps):
@@ -103,15 +103,15 @@ def get_timestamps(data):
return timestamps
def get_locs(data):
'''
Get all the unique locations from the input data.
Parameters:
data - The input data set.
Returns:
locs - A sorted ndarray of all the unique locations in the input data.
'''
print('Getting a list of all locations... ', end='', flush=True)
locs = np.sort(np.unique(np.array(data)[:, 4].astype(np.int)))
if not len(locs):
@@ -123,7 +123,8 @@ def get_locs(data):
return list(map(str, locs))
def get_cnts(data, t):
'''
Get the counts at every location for a specific timestamp.
Parameters:
@@ -131,8 +132,7 @@ def get_locs(data):
t - The timestamp of interest.
Returns:
cnts - A dict {loc:cnt} with the counts at every location for a specific timestamp.
'''
print('Getting all counts at %s... ' %(t), end='', flush=True)
locs = get_locs(data)
cnts = dict.fromkeys(locs, 0)
@@ -145,15 +145,15 @@ def get_cnts(data, t):
return cnts
def get_all_cnts(data):
'''
Get the counts at every location for every timestamp.
Parameters:
data - The input data set.
Returns:
cnts - A dict {timestamp@loc:cnt} with all the counts at every location for every timestamp.
'''
cnts = {}
for d in data:
key = d[1] + '@' + d[4]
@@ -163,15 +163,15 @@ def get_all_cnts(data):
return cnts
def get_usrs(data):
'''
Get a list of unique users in the input data set.
Parameters:
data - The input data set.
Returns:
users - An ndarray of all unique users.
'''
users = np.sort(np.unique(np.array(data)[:, 0].astype(np.int)))
if not len(users):
print('No users found.')
@@ -181,7 +181,8 @@ def get_usrs(data):
return users
def get_usr_data(data, id):
'''
Get the data of a particular user from a data set.
Parameters:
@@ -189,8 +190,7 @@ def get_usrs(data):
id - The user identifier.
Returns:
output - A list of the data of the targeted user.
'''
output = []
for d in data:
if (d[0] == str(id)):
@@ -200,30 +200,30 @@ def get_usr_data(data, id):
return output
def get_usrs_data(data):
'''
Get the data of every user in a data set.
Parameters:
data - The input data set.
Returns:
output - A dict {usr: [usr_data]} with the data of each user.
'''
output = {}
for d in data:
output[d[0]] = output.get(d[0], []) + [d]
return output
def get_usr_traj(data):
'''
Get the trajectory of a user from her data.
Parameters:
data - The data of the user.
Returns:
traj - A list [(timestamp, loc)] of the locations that the user visited and the corresponding timestamps.
'''
traj = []
for d in data:
traj.append((d[1], d[4]))
@@ -232,15 +232,15 @@ def get_usr_traj(data):
return traj
def get_poss_trans(data):
'''
Get all the possible transitions.
Parameters:
data - The input data set.
Returns:
trans - A set with all the possible forward transitions in the input.
'''
print('Getting possible transitions... ', end='', flush=True)
trans = set()
for u, u_data in data.items():
@@ -253,7 +253,8 @@ def get_poss_trans(data):
return trans
def get_bwd_trans(data):
'''
Get all backward transitions in a data set.
Parameters:
@@ -261,8 +262,7 @@ def get_poss_trans(data):
Returns:
trans - A dict {(t, t-1):[transitions]} with all the backward transitions
at every sequential timestamp pair in the input data set.
'''
print('Getting all backward transitions... ', end='', flush=True)
trans = {}
for u, u_data in data.items():
@@ -276,7 +276,8 @@ def get_bwd_trans(data):
return trans
def get_fwd_trans(data):
'''
Get all forward transitions in a data set.
Parameters:
@@ -284,8 +285,7 @@ def get_bwd_trans(data):
Returns:
trans - A dict {(t-1, t):[transitions]} with all the forward transitions
at every sequential timestamp pair in the input data set.
'''
print('Getting all forward transitions... ', end='', flush=True)
trans = {}
for u, u_data in data.items():
@@ -299,7 +299,8 @@ def get_fwd_trans(data):
return trans
def safe_div(a, b):
'''
Divide two numbers. If the divisor is 0 return inf.
Parameters:
@@ -307,14 +308,14 @@ def get_fwd_trans(data):
b - The divisor.
Returns:
The float result of the division.
'''
if b == 0:
return math.inf
return float(a/b)
def max_val(q, d, a):
'''
Calculate the maximum value of the objective function.
Parameters:
@@ -324,14 +325,14 @@ def safe_div(a, b):
timestamp.
Returns:
The maximum value of the objective function.
'''
if a == math.inf:
return math.nan
return (q*(math.exp(a) - 1) + 1)/(d*(math.exp(a) - 1) + 1)
def find_qd(p, a):
'''
Find two different rows (q and d) of a transition matrix (p)
that maximize the product of the objective function and return
their sums.
@@ -344,8 +345,7 @@ def max_val(q, d, a):
Returns:
sum_q - The sum of the elements of q.
sum_d - The sum of the elements of d.
'''
res = 0.0
sum_q, sum_d = 0.0, 0.0
for q in p: # A row from the transition matrix.
@@ -374,7 +374,8 @@ def find_qd(p, a):
return sum_q, sum_d
def gen_data(usrs, timestamps, locs):
'''
Generate data.
Parameters:
@@ -383,8 +384,7 @@ def find_qd(p, a):
locs - The number of locations.
Returns:
data - The generated data.
'''
print('Generating data... ', end='', flush=True)
# Generate timestamps.
ts = []
@@ -412,7 +412,8 @@ def gen_data(usrs, timestamps, locs):
return data
def gen_trans_mt(n, s):
'''
Generate a transition matrix.
Parameters:
@@ -422,8 +423,7 @@ def gen_data(usrs, timestamps, locs):
uniformity of each row.
Returns:
p_ - The transition matrix.
'''
if DEBUG:
print('Generating transition matrix %dx%d with s = %.4f... ' %(n, n, s), end='', flush=True)
p = np.zeros((n, n), float)
@@ -439,7 +439,8 @@ def gen_trans_mt(n, s):
return p_
def get_trans_mt(locs, trans):
'''
Get the transition matrix
Parameters:
@@ -448,8 +449,7 @@ def gen_trans_mt(n, s):
Returns:
p - A 2d dict {{locs}{locs}} containing the
corresponding location transition probabilities.
'''
if DEBUG:
print('Generating the transition matrix... ', end='', flush=True)
# Initialize the transition matrix.
@@ -476,7 +476,8 @@ def get_trans_mt(locs, trans):
return p
def get_entropy(mt):
'''
Calculate the measure-theoretic (Kolmogorov-Sinai) entropy
of a transition matrix.
@@ -484,8 +485,7 @@ def get_trans_mt(locs, trans):
mt - A 2d dict transition matrix.
Returns:
h - The Kolmogorov-Sinai entropy of the matrix.
'''
if DEBUG:
print('Calculating the measure-theoretic entropy... ', end='', flush=True)
h = 0.0
@@ -523,15 +523,15 @@ def get_entropy(mt):
return h
def get_2darray(mt):
'''
Convert a 2d dict to a 2d array.
Parameters:
mt - The 2d dict.
Returns:
p - The 2d numpy array.
'''
if type(mt) == type(np.array([])):
return mt
p = np.zeros((len(mt), len(mt)), float)
@@ -540,7 +540,8 @@ def get_2darray(mt):
return p
def get_laplace_pd(ts, t, sc):
'''
Get a Laplace probability distribution.
Parameters:
@@ -549,14 +550,14 @@ def get_2darray(mt):
sc - The scale of the distribution.
Returns:
The probability distribution.
'''
x = np.arange(0, len(ts), 1)
loc = np.where(ts == t)
return laplace.pdf(x, loc=loc, scale=sc)[0]
def get_norm_pd(ts, t, sc):
'''
Get a Gaussian probability distribution.
Parameters:
@@ -565,14 +566,14 @@ def get_laplace_pd(ts, t, sc):
sc - The scale of the distribution.
Returns:
The probability distribution.
'''
x = np.arange(0, len(ts), 1)
loc = np.where(ts == t)
return norm.pdf(x, loc=loc, scale=sc)[0]
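A minimal usage sketch of the two distribution helpers above (hypothetical timestamp values; it assumes the numpy/scipy imports and function definitions of this module):

ts = np.array(['08:00', '09:00', '10:00', '11:00'])
pd_lap = get_laplace_pd(ts, '09:00', sc=1.0)  # Laplace weights peaked at the index of '09:00'
pd_nrm = get_norm_pd(ts, '09:00', sc=1.0)     # Gaussian weights peaked at the same index
# Either array can then be passed to get_sample() as its pd argument.

Both helpers work over index positions (np.arange over len(ts)) rather than over the timestamp values themselves, which is why the timestamp of interest is first mapped to its index with np.where.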
def get_sample(ts, t, pct, pd):
'''
Get a sample from the time series.
Parameters:
@@ -583,8 +584,7 @@ def get_norm_pd(ts, t, sc):
of the probability distribution to be sampled.
Returns:
spl - An ndarray of the sampled timestamps.
'''
if DEBUG:
print('Sampling %.2f%% of %s at %s... ' %(pct*100, ts, t), end='', flush=True)
# Check that it is a valid timestamp.
@@ -604,7 +604,8 @@ def get_sample(ts, t, pct, pd):
return spl
def priv_l(p, a, e):
'''
Calculate the backward/forward privacy loss at the current
timestamp.
@@ -616,13 +617,13 @@ def get_sample(ts, t, pct, pd):
Returns:
The backward/forward privacy loss at the current
timestamp.
'''
sum_q, sum_d = find_qd(p, a)
return math.log(max_val(sum_q, sum_d, a)) + e
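In other words, priv_l computes the bound log((sum_q*(exp(a) - 1) + 1) / (sum_d*(exp(a) - 1) + 1)) + e, with sum_q and sum_d picked by find_qd to maximize the objective of max_val; the bpl*/fpl* routines below iterate exactly this step over the previous and next releases, respectively.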
def priv_l_m(p, a, e):
'''
Calculate the backward/forward privacy loss at the current
timestamp using memoization.
@@ -634,8 +635,7 @@ def priv_l(p, a, e):
Returns:
The backward/forward privacy loss at the current
timestamp.
'''
key = xxhash.xxh64(p).hexdigest() + str(a) + str(e)
global MEM, TOTAL, MISS
TOTAL += 1
@@ -648,7 +648,8 @@ def priv_l_m(p, a, e):
return result
def bpl(p, a, e, t):
'''
Calculate the total backward privacy loss at every timestamp.
Parameters:
@@ -660,15 +661,15 @@ def priv_l_m(p, a, e):
Returns:
a - The backward privacy loss at every timestamp
due to the previous data releases.
'''
a[0] = e[0]
for i in range(1, t):
a[i] = priv_l(p, a[i - 1], e[i])
return a
def bpl_m(p, a, e, t):
'''
Calculate the total backward privacy loss at the current
timestamp with memoization.
@@ -682,13 +683,13 @@ def bpl(p, a, e, t):
Returns:
a - The backward privacy loss at the current timestamp
due to the previous data releases.
'''
a[0] = e[0]
for i in range(1, t):
a[i] = priv_l_m(p, a[i - 1], e[i])
return a
def bpl_lmdk_mem(p, a, e, t, lmdk):
# t is (near) the landmark
if lmdk == t - 1 or t == lmdk:
@@ -702,7 +703,8 @@ def bpl_lmdk_mem(p, a, e, t, lmdk):
return a
def bpl_s(p, e, i, w):
'''
Calculate the total backward privacy loss at the current
timestamp using the static model, i.e., previous releases
are grouped in a window of static size.
@@ -716,8 +718,7 @@ def bpl_lmdk_mem(p, a, e, t, lmdk):
Returns:
a - The backward privacy loss at the current timestamp
due to the previous data releases.
'''
if i - w > 1:
# print('bpl_s: %d - %d [%d]' %(i, i - w, w))
return priv_l(np.linalg.matrix_power(p, w), bpl_s(p, e, i - w, w), e[i - 1])
@@ -729,7 +730,8 @@ def bpl_s(p, e, i, w):
return e[0]
def bpl_s_m(p, e, i, w):
'''
Calculate the total backward privacy loss at the current
timestamp using the static model, i.e., previous releases
are grouped in a window of static size, using memoization.
@@ -743,8 +745,7 @@ def bpl_s(p, e, i, w):
Returns:
a - The backward privacy loss at the current timestamp
due to the previous data releases.
'''
if i - w > 1:
return priv_l_m(np.linalg.matrix_power(p, w), bpl_s_m(p, e, i - w, w), e[i - 1])
elif i - w <= 1:
@@ -753,7 +754,8 @@ def bpl_s_m(p, e, i, w):
return e[0]
def bpl_l(p, e, i, w, l):
'''
Calculate the total backward privacy loss at the current
timestamp using the linear model, i.e., previous releases
are grouped in a window of a size that increases linearly.
@@ -769,8 +771,7 @@ def bpl_s_m(p, e, i, w):
Returns:
a - The backward privacy loss at the current timestamp
due to the previous data releases.
'''
if i - w*l > 1:
# print('bpl_l: %d - %d [%d]' %(i, i - w*l, w*l))
return priv_l(np.linalg.matrix_power(p, w*l), bpl_l(p, e, i - w*l, w, l + 1), e[i - 1])
@@ -782,7 +783,8 @@ def bpl_l(p, e, i, w, l):
return e[0]
def bpl_l_m(p, e, i, w, l):
'''
Calculate the total backward privacy loss at the current
timestamp using the linear model, i.e., previous releases
are grouped in a window of a size that increases linearly,
@@ -799,8 +801,7 @@ def bpl_l(p, e, i, w, l):
Returns:
a - The backward privacy loss at the current timestamp
due to the previous data releases.
'''
if i - w*l > 1:
return priv_l_m(np.linalg.matrix_power(p, w*l), bpl_l_m(p, e, i - w*l, w, l + 1), e[i - 1])
elif i - w*l <= 1:
@@ -809,7 +810,8 @@ def bpl_l_m(p, e, i, w, l):
return e[0]
def bpl_e(p, e, i, w, h):
'''
Calculate the total backward privacy loss at the current
timestamp using the exponential model, i.e., previous releases
are grouped in a window of a size that increases exponentially.
@@ -825,8 +827,7 @@ def bpl_l_m(p, e, i, w, l):
Returns:
a - The backward privacy loss at the current timestamp
due to the previous data releases.
'''
if i - w**h > 1:
# print('bpl_e: %d - %d [%d]' %(i, i - w**h, w**h))
return priv_l(np.linalg.matrix_power(p, w**h), bpl_e(p, e, i - w**h, w, h + 1), e[i - 1])
@@ -838,7 +839,8 @@ def bpl_e(p, e, i, w, h):
return e[0]
def bpl_e_m(p, e, i, w, h):
'''
Calculate the total backward privacy loss at the current
timestamp using the exponential model, i.e., previous releases
are grouped in a window of a size that increases exponentially,
@@ -855,8 +857,7 @@ def bpl_e(p, e, i, w, h):
Returns:
a - The backward privacy loss at the current timestamp
due to the previous data releases.
'''
if i - w**h > 1:
return priv_l_m(np.linalg.matrix_power(p, w**h), bpl_e_m(p, e, i - w**h, w, h + 1), e[i - 1])
elif i - w**h <= 1:
@@ -865,7 +866,8 @@ def bpl_e_m(p, e, i, w, h):
return e[0]
def fpl(p, a, e, t):
'''
Calculate the total forward privacy loss at the current
timestamp.
@@ -879,15 +881,15 @@ def bpl_e_m(p, e, i, w, h):
Returns:
a - The forward privacy loss at the current timestamp
due to the next data releases.
'''
a[t - 1] = e[t - 1]
for i in range(t - 2, -1, -1):
a[i] = priv_l(p, a[i + 1], e[i])
return a
def fpl_m(p, a, e, t):
'''
Calculate the total forward privacy loss at the current
timestamp, using memoization.
@@ -901,8 +903,7 @@ def fpl(p, a, e, t):
Returns:
a - The forward privacy loss at the current timestamp
due to the next data releases.
'''
a[t - 1] = e[t - 1]
for i in range(t - 2, -1, -1):
a[i] = priv_l_m(p, a[i + 1], e[i])
@@ -921,7 +922,8 @@ def fpl_lmdk_mem(p, a, e, t, lmdk):
return a
def fpl_s(p, e, i, t, w):
'''
Calculate the total forward privacy loss at the current
timestamp using the static model, i.e., next releases
are grouped in a window of static size.
@@ -935,8 +937,7 @@ def fpl_lmdk_mem(p, a, e, t, lmdk):
Returns:
a - The forward privacy loss at the current timestamp
due to the next data releases.
'''
if i + w < t:
# print('fpl_s: %d - %d [%d]' %(i, i + w, w))
return priv_l(np.linalg.matrix_power(p, w), fpl_s(p, e, i + w, t, w), e[i - 1])
@@ -948,7 +949,8 @@ def fpl_s(p, e, i, t, w):
return e[t - 1]
def fpl_s_m(p, e, i, t, w):
'''
Calculate the total forward privacy loss at the current
timestamp using the static model, i.e., next releases
are grouped in a window of static size, using memoization.
@@ -962,8 +964,7 @@ def fpl_s(p, e, i, t, w):
Returns:
a - The forward privacy loss at the current timestamp
due to the next data releases.
'''
if i + w < t:
return priv_l_m(np.linalg.matrix_power(p, w), fpl_s_m(p, e, i + w, t, w), e[i - 1])
elif i + w >= t:
@@ -972,7 +973,8 @@ def fpl_s_m(p, e, i, t, w):
return e[t - 1]
def fpl_l(p, e, i, t, w, l):
'''
Calculate the total forward privacy loss at the current
timestamp using the linear model, i.e., next releases
are grouped in a window of a size that increases linearly.
@@ -988,8 +990,7 @@ def fpl_s_m(p, e, i, t, w):
Returns:
a - The forward privacy loss at the current timestamp
due to the next data releases.
'''
if i + w*l < t:
# print('fpl_l: %d - %d [%d]' %(i, i + w*l, w*l))
return priv_l(np.linalg.matrix_power(p, w*l), fpl_l(p, e, i + w*l, t, w, l + 1), e[i - 1])
@@ -1001,7 +1002,8 @@ def fpl_l(p, e, i, t, w, l):
return e[t - 1]
def fpl_l_m(p, e, i, t, w, l):
'''
Calculate the total forward privacy loss at the current
timestamp using the linear model, i.e., next releases
are grouped in a window of a size that increases linearly,
@@ -1018,8 +1020,7 @@ def fpl_l(p, e, i, t, w, l):
Returns:
a - The forward privacy loss at the current timestamp
due to the next data releases.
'''
if i + w*l < t:
return priv_l_m(np.linalg.matrix_power(p, w*l), fpl_l_m(p, e, i + w*l, t, w, l + 1), e[i - 1])
elif i + w*l >= t:
@@ -1028,7 +1029,8 @@ def fpl_l_m(p, e, i, t, w, l):
return e[t - 1]
def fpl_e(p, e, i, t, w, h):
'''
Calculate the total forward privacy loss at the current
timestamp using the exponential model, i.e., next releases
are grouped in a window of a size that increases exponentially.
@@ -1044,8 +1046,7 @@ def fpl_l_m(p, e, i, t, w, l):
Returns:
a - The forward privacy loss at the current timestamp
due to the next data releases.
'''
if i + w**h < t:
# print('fpl_e: %d - %d [%d]' %(i, i + w**h, w**h))
return priv_l(np.linalg.matrix_power(p, w**h), fpl_e(p, e, i + w**h, t, w, h + 1), e[i - 1])
@@ -1057,7 +1058,8 @@ def fpl_e(p, e, i, t, w, h):
return e[t - 1]
def fpl_e_m(p, e, i, t, w, h):
'''
Calculate the total forward privacy loss at the current
timestamp using the exponential model, i.e., next releases
are grouped in a window of a size that increases exponentially,
@@ -1074,8 +1076,7 @@ def fpl_e(p, e, i, t, w, h):
Returns:
a - The forward privacy loss at the current timestamp
due to the next data releases.
'''
if i + w**h < t:
return priv_l_m(np.linalg.matrix_power(p, w**h), fpl_e_m(p, e, i + w**h, t, w, h + 1), e[i - 1])
elif i + w**h >= t:
@@ -1084,7 +1085,8 @@ def fpl_e_m(p, e, i, t, w, h):
return e[t - 1]
def tpl(bpl, fpl, e):
'''
Calculate the total privacy loss at every timestamp.
Parameters:
@@ -1093,12 +1095,12 @@ def fpl_e_m(p, e, i, t, w, h):
e - The privacy budget for data publishing.
Returns:
The list of total privacy loss at every timestamp.
'''
return [x + y - z for (x, y, z) in zip(bpl, fpl, e)]
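A minimal end-to-end sketch of how the backward, forward, and total loss routines above compose (a hypothetical two-location transition matrix and a uniform budget; not part of the original module):

p = np.array([[0.7, 0.3], [0.4, 0.6]])  # row-stochastic transition matrix
e = [0.1] * 5                           # per-timestamp privacy budget
t = len(e)
a_b = bpl(p, [0.0] * t, e, t)           # backward loss from previous releases
a_f = fpl(p, [0.0] * t, e, t)           # forward loss from next releases
a = tpl(a_b, a_f, e)                    # total loss: a_b + a_f - e, element-wise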
def tpl_lmdk_mem(e, p_b, p_f, seq, lmdks):
'''
Calculate the temporal privacy loss at every timestamp
taking into account landmarks.
@@ -1117,8 +1119,7 @@ def tpl(bpl, fpl, e):
due to the next data releases.
a - The total privacy loss at every timestamp
taking into account landmarks.
'''
a_b = np.zeros(len(seq))
a_f = np.zeros(len(seq))
a = np.zeros(len(seq))
@@ -1135,7 +1136,8 @@ def tpl_lmdk_mem(e, p_b, p_f, seq, lmdks):
return a_b, a_f, a
def get_limits(t, seq, lmdks):
'''
Get the limits for the calculation of temporal privacy loss.
Parameters:
@@ -1145,8 +1147,7 @@ def tpl_lmdk_mem(e, p_b, p_f, seq, lmdks):
Returns:
t_prv - The previous landmark.
t_nxt - The next landmark.
'''
# Add landmark limits.
seq_lmdks = np.copy(lmdks)
# if seq[0] not in seq_lmdks:
@@ -1174,7 +1175,8 @@ def get_limits(t, seq, lmdks):
return t_prv, t_nxt
def plot_loss(title, e, a_b, a_f, a):
'''
Plots the privacy loss of the time series.
Parameters:
@@ -1185,8 +1187,7 @@ def get_limits(t, seq, lmdks):
a - The total privacy loss.
Returns:
Nothing.
'''
plt.rc('font', family='serif')
plt.rc('font', size=10)
plt.rc('text', usetex=True)
@@ -1221,7 +1222,8 @@ def plot_loss(title, e, a_b, a_f, a):
plt.show()
def cmp_loss(title, a, a_s, a_e, a_l):
'''
Plots a comparison of the privacy loss of all models.
Parameters:
@@ -1232,8 +1234,7 @@ def plot_loss(title, e, a_b, a_f, a):
a_l - The privacy loss of the linear model.
Returns:
Nothing.
'''
plt.rc('font', family='serif')
plt.rc('font', size=10)
plt.rc('text', usetex=True)
@@ -1268,7 +1269,8 @@ def cmp_loss(title, a, a_s, a_e, a_l):
plt.show()
def parse_args():
'''
Parse arguments.
Mandatory:
@@ -1283,8 +1285,7 @@ def cmp_loss(title, a, a_s, a_e, a_l):
-o, --output, The path to the output directory.
-t, --time, The time limit.
-w, --window, The size of the event protection window.
'''
# Create argument parser.
parser = argparse.ArgumentParser()


@@ -7,4 +7,4 @@ The {\thething} selection module introduces a reasonable data utility decline to
% \kat{it would be nice to see it clearly on Figure 5.5. (eg, by including another bar that shows adaptive without landmark selection)}
% \mk{Done.}
In terms of temporal correlation, we observe that under moderate and strong temporal correlation, a greater average regular--{\thething} event distance in a {\thething} distribution causes greater overall privacy loss.
Finally, the contribution of {\thething} privacy to enhancing the data utility, while preserving $\varepsilon$-differential privacy, is demonstrated by the fact that the selected Adaptive scheme provides better data utility than the user-level privacy protection.


@@ -22,7 +22,7 @@ Take for example the scenario in Figure~\ref{fig:st-cont}, where {\thethings} ar
If we want to protect the {\thething} points, we have to allocate at most a budget of $\varepsilon$ to the {\thethings}, while saving some for the release of regular events.
Essentially, the more budget we allocate to an event the less we protect it, but at the same time we maintain its utility.
With {\thething} privacy we propose to distribute the budget taking into account only the existence of the {\thethings} when we release an event of the time series, i.e.,~allocating $\frac{\varepsilon}{5}$ ($4\ \text{\thethings} + 1\ \text{regular point}$) to each event (see Figure~\ref{fig:st-cont}).
This way, we still guarantee\footnote{$\varepsilon$-differential privacy guarantees that the allocated budget should be less than or equal to $\varepsilon$, and not precisely how much.\kat{Mano check.}} that the {\thethings} are adequately protected, as they receive a total budget of $\frac{4\varepsilon}{5}<\varepsilon$.
At the same time, we avoid over-perturbing the regular events, as we allocate to them a higher total budget ($\frac{4\varepsilon}{5}$) compared to the user-level scenario ($\frac{\varepsilon}{2}$), and thus less noise.
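The arithmetic of this allocation (a summary of the example above, with $L$ denoting the set of {\thething} timestamps) is:
\[
\varepsilon_t = \frac{\varepsilon}{4 + 1} = \frac{\varepsilon}{5},
\qquad
\sum_{t \in L} \varepsilon_t = 4 \cdot \frac{\varepsilon}{5} = \frac{4\varepsilon}{5} < \varepsilon .
\]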


@@ -77,7 +77,7 @@ Intuitively, knowing the data set at timestamp $t$ stops the propagation of the
%\kat{do we see this in the formula 1 ?}
%when calculating the forward or backward privacy loss respectively.
Cao et al.~\cite{cao2017quantifying} propose a method for computing the temporal privacy loss $\alpha_t$ at a timestamp $t$ as the sum of the backward and forward privacy loss, $\alpha^B_t$ and $\alpha^F_t$, minus the privacy budget $\varepsilon_t$
to account for the extra privacy loss due to previous and next releases $\pmb{o}$ of $\mathcal{M}$ under temporal correlation.
By Theorem~\ref{theor:thething-prv}, at every timestamp $t$ we consider the data at $t$ and at the {\thething} timestamps $L$.
%According to the Definitions~{\ref{def:bpl} and \ref{def:fpl}}, we calculate the backward and forward privacy loss by taking into account the privacy budget at previous and next data releases respectively.
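In the notation above, the composition described by Cao et al. reads:
\[
\alpha_t = \alpha^B_t + \alpha^F_t - \varepsilon_t .
\]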