#!/usr/bin/env python3
import argparse
import collections
import csv
import datetime
import math
from matplotlib import pyplot as plt
import numpy as np
import os
import random
from scipy.sparse.linalg import eigs
from scipy.stats import laplace
from scipy.stats import norm
import sympy as sp
import time
import xxhash
# Debugging
DEBUG = False # When enabled, print diagnostic information.
# Memoization
MEM = {} # The cache.
MISS = 0 # Number of cache misses, i.e., additions to the cache.
TOTAL = 0 # Total number of cache accesses.
'''
Read data from a file.
Parameters:
path - The relative path to the data file.
Returns:
data - A list of records [uid, timestamp, lng, lat, loc].
'''
def load_data(path):
print('Loading data from', os.path.abspath(path), '... ', end='')
data = []
try:
with open(path, 'r') as f:
reader = csv.reader(f, delimiter=',')
data = list(reader)
print('OK.')
return data
    except IOError as err:
        print('Failed: %s' %(err))
        exit()
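# The expected input is a headerless CSV where every row has the layout
# [uid, timestamp, lng, lat, loc]; an illustrative (made-up) record:
# 42,2024-01-01 09:00,23.7275,37.9838,17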
'''
Save output to a file.
Parameters:
path - The relative path to the output file.
t - The number of timestamps.
e - The privacy budget at each timestamp.
a_b - The backward privacy loss at each timestamp.
a_f - The forward privacy loss at each timestamp.
a - The temporal privacy loss at each timestamp.
Returns:
Nothing.
'''
def save_output(path, t, e, a_b, a_f, a):
# timestamp = time.strftime('%Y%m%d%H%M%S')
print('Saving output to %s... ' %(path), end='', flush=True)
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, 'w') as f:
w = csv.writer(f, delimiter=';')
w.writerow(['t', 'e', 'bpl', 'fpl', 'tpl'])
w.writerows(zip(list(range(1, t + 1)), e, a_b, a_f, a))
print('OK.', flush=True)
'''
Get all the timestamps from the input data.
Parameters:
data - The input data set.
Returns:
timestamps - An ndarray of all of the timestamps from the input data.
'''
def get_timestamps(data):
print('Getting a list of all timestamps... ', end='', flush=True)
timestamps = np.sort(np.unique(np.array(data)[:, 1]))
if not len(timestamps):
print('No timestamps found.', flush=True)
exit()
# Check timestamps' order.
prev = None
for t in timestamps:
curr = time.mktime(datetime.datetime.strptime(t, '%Y-%m-%d %H:%M').timetuple())
if prev:
if curr < prev:
print('Wrong order.', flush=True)
exit()
prev = curr
print('OK.', flush=True)
if DEBUG:
print(timestamps, len(timestamps))
return timestamps
'''
Get all the unique locations from the input data.
Parameters:
data - The input data set.
Returns:
locs - A sorted list of all the unique locations in the input data, as strings.
'''
def get_locs(data):
print('Getting a list of all locations... ', end='', flush=True)
    locs = np.sort(np.unique(np.array(data)[:, 4].astype(int))) # np.int was removed in NumPy 1.24; use the built-in int.
if not len(locs):
print('No locations found.', flush=True)
exit()
print('OK.', flush=True)
if DEBUG:
print(list(map(str, locs)), len(locs))
return list(map(str, locs))
'''
Get the counts at every location for a specific timestamp.
Parameters:
data - The input data set.
t - The timestamp of interest.
Returns:
cnts - A dict {loc:cnt} with the counts at every location for a specific timestamp.
'''
def get_cnts(data, t):
print('Getting all counts at %s... ' %(t), end='', flush=True)
locs = get_locs(data)
    cnts = dict.fromkeys(locs, 0)
    for d in data:
        if d[1] == t:
            cnts[d[4]] += 1 # get_locs() returns string keys, so index with the raw field.
print('OK.', flush=True)
if DEBUG:
print(cnts)
return cnts
'''
Get the counts at every location for every timestamp.
Parameters:
data - The input data set.
Returns:
cnts - A dict {'timestamp@loc': cnt} with the counts at every location for every timestamp.
'''
def get_all_cnts(data):
cnts = {}
for d in data:
key = d[1] + '@' + d[4]
cnts[key] = cnts.get(key, 0) + 1
if DEBUG:
print(cnts)
return cnts
'''
Get a list of unique users in the input data set.
Parameters:
data - The input data set.
Returns:
users - An ndarray of all unique users.
'''
def get_usrs(data):
    users = np.sort(np.unique(np.array(data)[:, 0].astype(int)))
if not len(users):
print('No users found.')
exit()
if DEBUG:
print(users, len(users))
return users
'''
Get the data of a particular user from a data set.
Parameters:
data - The input data set.
id - The user identifier.
Returns:
output - A list of the data of the targeted user.
'''
def get_usr_data(data, id):
output = []
for d in data:
if (d[0] == str(id)):
output.append(d)
if DEBUG:
print(id, output)
return output
'''
Get the data of every user in a data set.
Parameters:
data - The input data set.
Returns:
output - A dict {usr: [usr_data]} with the data of each user.
'''
def get_usrs_data(data):
output = {}
for d in data:
output[d[0]] = output.get(d[0], []) + [d]
return output
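# Illustrative shape of the result (made-up records): two rows for uid '1'
# yield {'1': [[uid, t, lng, lat, loc], [uid, t, lng, lat, loc]]}, i.e., the
# records are grouped, in input order, by the first column.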
'''
Get the trajectory of a user from her data.
Parameters:
data - The data of the user.
Returns:
traj - A list [(timestamp, loc)] with the locations and corresponding timestamps that the user was at.
'''
def get_usr_traj(data):
traj = []
for d in data:
traj.append((d[1], d[4]))
if DEBUG:
print(traj)
return traj
'''
Get all the possible transitions.
Parameters:
data - The input data set.
Returns:
trans - A set with all the possible forward transitions in the input.
'''
def get_poss_trans(data):
print('Getting possible transitions... ', end='', flush=True)
trans = set()
for u, u_data in data.items():
traj = get_usr_traj(u_data)
trans.update(list(zip(traj, traj[1:])))
if len(trans) == 0:
print('None.', flush=True)
exit()
print('OK.', flush=True)
return trans
'''
Get all backward transitions in a data set.
Parameters:
data - The input data set.
Returns:
trans - A dict {(t, t-1):[transitions]} with all the backward transitions
at every sequential timestamp pair in the input data set.
'''
def get_bwd_trans(data):
print('Getting all backward transitions... ', end='', flush=True)
trans = {}
for u, u_data in data.items():
traj = get_usr_traj(u_data)
for i, j in zip(traj[1:], traj):
trans[(i[0], j[0])] = trans.get((i[0], j[0]), []) + [(i[1], j[1])]
if len(trans) == 0:
print('None.', flush=True)
exit()
print('OK.', flush=True)
return trans
'''
Get all forward transitions in a data set.
Parameters:
data - The input data set.
Returns:
trans - A dict {(t-1, t):[transitions]} with all the forward transitions
at every sequential timestamp pair in the input data set.
'''
def get_fwd_trans(data):
print('Getting all forward transitions... ', end='', flush=True)
trans = {}
for u, u_data in data.items():
traj = get_usr_traj(u_data)
for i, j in zip(traj, traj[1:]):
trans[(i[0], j[0])] = trans.get((i[0], j[0]), []) + [(i[1], j[1])]
if len(trans) == 0:
print('None.', flush=True)
exit()
print('OK.', flush=True)
return trans
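# Illustrative structure (hypothetical values): a user at location '3' at t1
# and at '5' at t2 adds ('3', '5') to trans[(t1, t2)], so every dict value
# lists the observed moves of all users over that pair of timestamps.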
'''
Divide two numbers. If the divisor is 0 return inf.
Parameters:
a - The dividend.
b - The divisor.
Returns:
The float result of the division.
'''
def safe_div(a, b):
if b == 0:
return math.inf
return float(a/b)
'''
Calculate the maximum value of the objective function.
Parameters:
q - A row from the transition matrix.
d - Another row from the transition matrix.
a - The backward/forward privacy loss of the previous/next
timestamp.
Returns:
The maximum value of the objective function.
'''
def max_val(q, d, a):
if a == math.inf:
return math.nan
return (q*(math.exp(a) - 1) + 1)/(d*(math.exp(a) - 1) + 1)
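# Numeric sanity check (made-up values): for q = 0.9, d = 0.1, and a = 1.0,
# max_val returns (0.9*(e - 1) + 1)/(0.1*(e - 1) + 1) ~ 2.5465/1.1718 ~ 2.173,
# whose natural logarithm (~0.776) is the correlation-induced loss term.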
'''
Find two different rows (q and d) of a transition matrix (p)
that maximize the product of the objective function and return
their sums.
Parameters:
p - The transition matrix representing the backward/forward
correlations.
a - The backward/forward privacy loss of the previous/next
timestamp.
Returns:
sum_q - The sum of the elements of q.
sum_d - The sum of the elements of d.
'''
def find_qd(p, a):
res = 0.0
sum_q, sum_d = 0.0, 0.0
for q in p: # A row from the transition matrix.
for d in p: # Another row from the transition matrix.
if not np.array_equal(q, d) and sum(q) and sum(d):
plus = [] # Indexes of the elements of q+ and d+.
for i in range(np.shape(p)[1]): # Iterate each column.
if q[i] > d[i]:
plus.append(i)
                s_q, s_d = 0.0, 0.0
                update = True
                while update:
                    update = False
                    # Recompute the sums from scratch on each pass; the
                    # previous version kept accumulating across passes.
                    s_q = sum(q[i] for i in plus)
                    s_d = sum(d[i] for i in plus)
                    max_tmp = max_val(s_q, s_d, a)
                    # Iterate over a copy, because removing from a list
                    # while iterating over it skips elements.
                    for i in list(plus):
                        if safe_div(q[i], d[i]) <= max_tmp:
                            plus.remove(i)
                            update = True
                res_tmp = math.log(max_val(s_q, s_d, a))
                if res < res_tmp:
                    res = res_tmp
                    sum_q, sum_d = s_q, s_d
return sum_q, sum_d
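# Usage sketch (hypothetical two-state matrix):
# p = np.array([[0.9, 0.1], [0.2, 0.8]])
# sum_q, sum_d = find_qd(p, 1.0) # a = 1.0 carried over from the adjacent timestamp
# loss = math.log(max_val(sum_q, sum_d, 1.0))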
'''
Generate data.
Parameters:
usrs - The number of users.
timestamps - The number of timestamps.
locs - The number of locations.
Returns:
data - The generated data.
'''
def gen_data(usrs, timestamps, locs):
print('Generating data... ', end='', flush=True)
# Generate timestamps.
ts = []
t_start = datetime.datetime.now()
for t in range(timestamps):
t_start += datetime.timedelta(minutes=t)
ts.append(str(t_start.strftime('%Y-%m-%d %H:%M')))
# Randomly generate unique possible transitions.
trans = set()
for i in range(locs**2):
trans.add((str(random.randint(1, locs)), str(random.randint(1, locs))))
# Generate the data.
data = []
for u in range(1, usrs + 1):
# Choose a random starting location.
        l_start = random.choice(tuple(trans))[0] # random.sample() no longer accepts a set (Python 3.11+).
for t in ts:
# Choose a random next location.
while True:
l_nxt = str(random.randint(1, locs))
if (l_start, l_nxt) in trans:
data.append([str(u), str(t), str(l_nxt), str(l_nxt), str(l_nxt)])
break
print('OK.', flush=True)
return data
'''
Generate a transition matrix.
Parameters:
n - The dimension of the matrix.
s - The correlation degree in [0, 1]: the lower its value,
the less uniform each row is, i.e., the stronger the
correlation (s = 0 keeps the identity matrix).
Returns:
p_ - The transition matrix.
'''
def gen_trans_mt(n, s):
if DEBUG:
print('Generating transition matrix %dx%d with s = %.4f... ' %(n, n, s), end='', flush=True)
p = np.zeros((n, n), float)
np.fill_diagonal(p, 1.0)
p_ = np.zeros((n, n), float)
    for j, p_j in enumerate(p): # Iterate over the rows.
for k, p_jk in enumerate(p_j):
p_[j, k] = round((p[j, k] + s) / (sum(p_j) + len(p_j)*s), 4)
if DEBUG:
print('OK.', flush=True)
if DEBUG:
print(p_)
return p_
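# Worked example: gen_trans_mt(2, 0.1) smooths the 2x2 identity row-wise into
# [[(1 + 0.1)/(1 + 2*0.1), 0.1/(1 + 2*0.1)], ...] = [[0.9167, 0.0833], ...];
# as s grows, every row approaches the uniform distribution [0.5, 0.5].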
'''
Get the transition matrix
Parameters:
locs - A list of all the locations.
trans - A list of all transitions.
Returns:
p - A 2d dict {{locs}{locs}} containing the
corresponding location transition probabilities.
'''
def get_trans_mt(locs, trans):
if DEBUG:
print('Generating the transition matrix... ', end='', flush=True)
# Initialize the transition matrix.
p = collections.defaultdict(dict)
for i in locs:
for j in locs:
p[i][j] = 0.0
    # Populate the transition matrix.
    # Build the counters once, instead of one Counter per matrix cell.
    cnt = collections.Counter(trans)
    # Count the transitions from each location to any location.
    sum_cnts = collections.Counter(tr[0] for tr in trans)
    for i in locs:
        for j in locs:
            res = 0.0
            if cnt[(i, j)] != 0 and sum_cnts[i] != 0:
                res = cnt[(i, j)]/sum_cnts[i]
            # Update the transition matrix.
            p[i][j] = res
if DEBUG:
print('OK.', flush=True)
for i in locs:
print(p[i])
return p
'''
Calculate the measure-theoretic (Kolmogorov-Sinai) entropy
of a transition matrix.
Parameters:
mt - A 2d dict transition matrix.
Returns:
h - The Kolmogorov-Sinai entropy of the matrix.
'''
def get_entropy(mt):
if DEBUG:
print('Calculating the measure-theoretic entropy... ', end='', flush=True)
h = 0.0
# Get the 2d array of the transition matrix.
a = get_2darray(mt)
# Check if the transition matrix is empty.
if np.allclose(a, np.zeros((len(a), len(a)), float)):
return h
# Find the stationary distribution (v), written as a row
# vector, that does not change under the application of
# the transition matrix. In other words, the left eigenvector
# associated with eigenvalue (w) 1. It is also called
# Perron-Frobenius eigenvector, since it is a stochastic
# matrix and the largest magnitude of the eigenvalues must
# be 1. We use the transpose of the transition matrix to get
# the left eigenvector.
w, v = eigs(a.T, k=1, which='LM')
# Normalize the eigenvector. The vector is a probability
# distribution, and therefore its sum must be 1. Thus, we use
# its absolute elements' values and its l1-norm to normalize it.
p = np.absolute(v[:,0].real)
p_norm = np.linalg.norm(p, ord=1)
if p_norm:
p = p/p_norm
# Calculate the entropy.
if np.isclose(w.real[0], 1, atol=1e-03):
for i in range(len(a)):
for j in range(len(a)):
# When the transition probability is 0 use the limiting value (0).
if not np.isclose(a[i][j], 0, atol=1e-03) and not np.isclose(p[i], 0, atol=1e-03):
# Binary logarithm measures entropy in bits.
h += -p[i]*a[i][j]*math.log2(a[i][j])
if DEBUG:
print(h, flush=True)
return h
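# Worked example: for the uniform 3x3 matrix (all entries 1/3) the stationary
# distribution is [1/3, 1/3, 1/3], so h = -9*(1/3)*(1/3)*log2(1/3) ~ 1.585
# bits. (eigs() with k=1 needs n >= 3, so 2x2 matrices cannot be used here.)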
'''
Convert a 2d dict to a 2d array.
Parameters:
mt - The 2d dict.
Returns:
p - The 2d numpy array.
'''
def get_2darray(mt):
    if isinstance(mt, np.ndarray):
return mt
p = np.zeros((len(mt), len(mt)), float)
for i, mt_i in enumerate(mt.items()):
p[i] = list(mt_i[1].values())
return p
'''
Get a Laplace probability distribution.
Parameters:
ts - The points of the distribution.
t - The location of the distribution.
sc - The scale of the distribution.
Returns:
The probability distribution.
'''
def get_laplace_pd(ts, t, sc):
    x = np.arange(0, len(ts), 1)
    # np.where() returns a tuple of index arrays; take the scalar index.
    loc = np.where(ts == t)[0][0]
    return laplace.pdf(x, loc=loc, scale=sc)
'''
Get a Gaussian probability distribution.
Parameters:
ts - The points of the distribution.
t - The location of the distribution.
sc - The scale of the distribution.
Returns:
The probability distribution.
'''
def get_norm_pd(ts, t, sc):
    x = np.arange(0, len(ts), 1)
    # np.where() returns a tuple of index arrays; take the scalar index.
    loc = np.where(ts == t)[0][0]
    return norm.pdf(x, loc=loc, scale=sc)
'''
Get a sample from the time series.
Parameters:
ts - An ndarray of the timestamps.
t - The current timestamp.
pct - The desired portion [0, 1] of the non-zero elements
of the probability distribution to be sampled.
pd - The probability distribution.
Returns:
spl - An ndarray of the sampled timestamps.
'''
def get_sample(ts, t, pct, pd):
if DEBUG:
print('Sampling %.2f%% of %s at %s... ' %(pct*100, ts, t), end='', flush=True)
# Check that it is a valid timestamp.
    if t not in ts:
print('%s is not in the time series.' %(t))
exit()
# Remove the current timestamp from the possible sample.
i = np.where(ts == t)
ts = np.delete(ts, i)
pd = np.delete(pd, i)
# Make sure that the probabilities sum to 1.
pd_norm = pd/np.linalg.norm(pd, ord=1)
# Get the sorted sample.
spl = np.sort(np.random.choice(ts, size=int(np.count_nonzero(pd)*pct), replace=False, p=pd_norm))
if DEBUG:
print(spl, flush=True)
return spl
'''
Calculate the backward/forward privacy loss at the current
timestamp.
Parameters:
p - The transition matrix representing the backward/forward
temporal correlations.
a - The privacy loss of the previous/next timestamp.
e - The privacy budget for data publishing.
Returns:
The backward/forward privacy loss at the current
timestamp.
'''
def priv_l(p, a, e):
sum_q, sum_d = find_qd(p, a)
return math.log(max_val(sum_q, sum_d, a)) + e
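# Sketch of one step (toy values): with a uniform budget of 0.1 per release,
# the loss at the second timestamp is priv_l(p, 0.1, 0.1) =
# log(max_val(sum_q, sum_d, 0.1)) + 0.1, i.e., the fresh budget plus the
# correlation term induced by the carried-over loss.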
'''
Calculate the backward/forward privacy loss at the current
timestamp using memoization.
Parameters:
p - The transition matrix representing the backward/forward
temporal correlations.
a - The privacy loss of the previous/next timestamp.
e - The privacy budget for data publishing.
Returns:
The backward/forward privacy loss at the current
timestamp.
'''
def priv_l_m(p, a, e):
key = xxhash.xxh64(p).hexdigest() + str(a) + str(e)
global MEM, TOTAL, MISS
TOTAL += 1
    result = MEM.get(key)
    if result is None: # 'not result' would treat a cached 0.0 as a miss.
        MISS += 1
sum_q, sum_d = find_qd(p, a)
result = math.log(max_val(sum_q, sum_d, a)) + e
MEM[key] = result
return result
'''
Calculate the total backward privacy loss at every timestamp.
Parameters:
p - The transition matrix representing the backward
temporal correlations.
a - The backward privacy loss of every release.
e - The privacy budget for data publishing.
t - The time limit.
Returns:
a - The backward privacy loss at every timestamp
due to the previous data releases.
'''
def bpl(p, a, e, t):
a[0] = e[0]
for i in range(1, t):
a[i] = priv_l(p, a[i - 1], e[i])
return a
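# Usage sketch (toy values):
# e = [0.1, 0.1, 0.1]
# a = bpl(p, [0.0]*3, e, 3) # a[0] = 0.1; a[i] = priv_l(p, a[i - 1], e[i])
# Each a[i] is at least e[i], since the correlation term is non-negative.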
'''
Calculate the total backward privacy loss at the current
timestamp with memoization.
Parameters:
p - The transition matrix representing the backward
temporal correlations.
a - The backward privacy loss of the current release
at all previous timestamps.
e - The privacy budget for data publishing.
t - The time limit.
Returns:
a - The backward privacy loss at the current timestamp
due to the previous data releases.
'''
def bpl_m(p, a, e, t):
a[0] = e[0]
for i in range(1, t):
a[i] = priv_l_m(p, a[i - 1], e[i])
return a
'''
Calculate the total backward privacy loss at the current
timestamp, taking into account the previous landmark, using
memoization.
Parameters:
p - The transition matrix representing the backward
temporal correlations.
a - The backward privacy loss of every release.
e - The privacy budget for data publishing.
t - The current timestamp.
lmdk - The previous landmark timestamp.
Returns:
a - The backward privacy loss at every timestamp since the
previous landmark.
'''
def bpl_lmdk_mem(p, a, e, t, lmdk):
# t is (near) the landmark
if lmdk == t - 1 or t == lmdk:
        return np.zeros(len(a))
# There is no landmark before
if lmdk < 1:
lmdk = 1
a[0] = e[0]
for i in range(lmdk, t):
a[i] = priv_l_m(p, a[i - 1], e[i])
return a
'''
Calculate the total backward privacy loss at the current
timestamp using the static model, i.e., previous releases
are grouped in a window of static size.
Parameters:
p - The transition matrix representing the backward
temporal correlations.
e - The privacy budget for data publishing.
i - The timestamp of interest.
w - The window size to group previous releases.
Returns:
a - The backward privacy loss at the current timestamp
due to the previous data releases.
'''
def bpl_s(p, e, i, w):
    # Check the base case first: with the original branch order, i == 1 fell
    # into the i - w <= 1 branch and double counted e[0].
    if i == 1:
        return e[0]
    if i - w > 1:
        # print('bpl_s: %d - %d [%d]' %(i, i - w, w))
        return priv_l(np.linalg.matrix_power(p, w), bpl_s(p, e, i - w, w), e[i - 1])
    # print('bpl_s: %d - %d [%d]' %(i, 1, i - 1))
    return priv_l(np.linalg.matrix_power(p, i - 1), e[0], e[i - 1])
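# Sketch of the recursion (w = 2, i = 5): bpl_s groups the previous releases
# as 5 -> 3 -> 1, applying p^2 per hop, so roughly i/w recursive steps are
# needed instead of the i steps of the basic model.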
'''
Calculate the total backward privacy loss at the current
timestamp using the static model, i.e., previous releases
are grouped in a window of static size, using memoization.
Parameters:
p - The transition matrix representing the backward
temporal correlations.
e - The privacy budget for data publishing.
i - The timestamp of interest.
w - The window size to group previous releases.
Returns:
a - The backward privacy loss at the current timestamp
due to the previous data releases.
'''
def bpl_s_m(p, e, i, w):
    # Base case first (see bpl_s).
    if i == 1:
        return e[0]
    if i - w > 1:
        return priv_l_m(np.linalg.matrix_power(p, w), bpl_s_m(p, e, i - w, w), e[i - 1])
    return priv_l_m(np.linalg.matrix_power(p, i - 1), e[0], e[i - 1])
'''
Calculate the total backward privacy loss at the current
timestamp using the linear model, i.e., previous releases
are grouped in a window of a size that increases linearly.
Parameters:
p - The transition matrix representing the backward
temporal correlations.
e - The privacy budget for data publishing.
i - The timestamp of interest.
w - The window size to group previous releases.
l - The linearly increasing coefficient that affects the
window size.
Returns:
a - The backward privacy loss at the current timestamp
due to the previous data releases.
'''
def bpl_l(p, e, i, w, l):
    # Base case first (see bpl_s).
    if i == 1:
        return e[0]
    if i - w*l > 1:
        # print('bpl_l: %d - %d [%d]' %(i, i - w*l, w*l))
        return priv_l(np.linalg.matrix_power(p, w*l), bpl_l(p, e, i - w*l, w, l + 1), e[i - 1])
    # print('bpl_l: %d - %d [%d]' %(i, 1, i - 1))
    return priv_l(np.linalg.matrix_power(p, i - 1), e[0], e[i - 1])
'''
Calculate the total backward privacy loss at the current
timestamp using the linear model, i.e., previous releases
are grouped in a window of a size that increases linearly,
using memoization.
Parameters:
p - The transition matrix representing the backward
temporal correlations.
e - The privacy budget for data publishing.
i - The timestamp of interest.
w - The window size to group previous releases.
l - The linearly increasing coefficient that affects the
window size.
Returns:
a - The backward privacy loss at the current timestamp
due to the previous data releases.
'''
def bpl_l_m(p, e, i, w, l):
    # Base case first (see bpl_s).
    if i == 1:
        return e[0]
    if i - w*l > 1:
        return priv_l_m(np.linalg.matrix_power(p, w*l), bpl_l_m(p, e, i - w*l, w, l + 1), e[i - 1])
    return priv_l_m(np.linalg.matrix_power(p, i - 1), e[0], e[i - 1])
'''
Calculate the total backward privacy loss at the current
timestamp using the exponential model, i.e., previous releases
are grouped in a window of a size that increases exponentially.
Parameters:
p - The transition matrix representing the backward
temporal correlations.
e - The privacy budget for data publishing.
i - The timestamp of interest.
w - The window size to group previous releases.
h - The exponentially increasing coefficient that affects
the window size.
Returns:
a - The backward privacy loss at the current timestamp
due to the previous data releases.
'''
def bpl_e(p, e, i, w, h):
    # Base case first (see bpl_s).
    if i == 1:
        return e[0]
    if i - w**h > 1:
        # print('bpl_e: %d - %d [%d]' %(i, i - w**h, w**h))
        return priv_l(np.linalg.matrix_power(p, w**h), bpl_e(p, e, i - w**h, w, h + 1), e[i - 1])
    # print('bpl_e: %d - %d [%d]' %(i, 1, i - 1))
    return priv_l(np.linalg.matrix_power(p, i - 1), e[0], e[i - 1])
'''
Calculate the total backward privacy loss at the current
timestamp using the exponential model, i.e., previous releases
are grouped in a window of a size that increases exponentially,
using memoization.
Parameters:
p - The transition matrix representing the backward
temporal correlations.
e - The privacy budget for data publishing.
i - The timestamp of interest.
w - The window size to group previous releases.
h - The exponentially increasing coefficient that affects
the window size.
Returns:
a - The backward privacy loss at the current timestamp
due to the previous data releases.
'''
def bpl_e_m(p, e, i, w, h):
    # Base case first (see bpl_s).
    if i == 1:
        return e[0]
    if i - w**h > 1:
        return priv_l_m(np.linalg.matrix_power(p, w**h), bpl_e_m(p, e, i - w**h, w, h + 1), e[i - 1])
    return priv_l_m(np.linalg.matrix_power(p, i - 1), e[0], e[i - 1])
'''
Calculate the total forward privacy loss at the current
timestamp.
Parameters:
p - The transition matrix representing the forward
temporal correlations.
a - The forward privacy loss of the current release
at all next timestamps.
e - The privacy budget for data publishing.
t - The time limit.
Returns:
a - The forward privacy loss at the current timestamp
due to the next data releases.
'''
def fpl(p, a, e, t):
a[t - 1] = e[t - 1]
for i in range(t - 2, -1, -1):
a[i] = priv_l(p, a[i + 1], e[i])
return a
'''
Calculate the total forward privacy loss at the current
timestamp, using memoization.
Parameters:
p - The transition matrix representing the forward
temporal correlations.
a - The forward privacy loss of the current release
at all next timestamps.
e - The privacy budget for data publishing.
t - The time limit.
Returns:
a - The forward privacy loss at the current timestamp
due to the next data releases.
'''
def fpl_m(p, a, e, t):
a[t - 1] = e[t - 1]
for i in range(t - 2, -1, -1):
a[i] = priv_l_m(p, a[i + 1], e[i])
return a
'''
Calculate the total forward privacy loss at the current
timestamp, taking into account the next landmark, using
memoization.
Parameters:
p - The transition matrix representing the forward
temporal correlations.
a - The forward privacy loss of every release.
e - The privacy budget for data publishing.
t - The current timestamp.
lmdk - The next landmark timestamp.
Returns:
a - The forward privacy loss at every timestamp until the
next landmark.
'''
def fpl_lmdk_mem(p, a, e, t, lmdk):
# t is (near) the landmark
if lmdk == t + 1 or t == lmdk:
        return np.zeros(len(a))
# There is no landmark after
if lmdk > len(e):
lmdk = len(e)
a[lmdk - 1] = e[lmdk - 1]
for i in range(lmdk - 2, t - 2, -1):
a[i] = priv_l_m(p, a[i + 1], e[i])
return a
'''
Calculate the total forward privacy loss at the current
timestamp using the static model, i.e., next releases
are grouped in a window of static size.
Parameters:
p - The transition matrix representing the forward
temporal correlations.
e - The privacy budget for data publishing.
i - The timestamp of interest.
t - The time limit.
w - The window size to group next releases.
Returns:
a - The forward privacy loss at the current timestamp
due to the next data releases.
'''
def fpl_s(p, e, i, t, w):
    # Check the base case first: with the original branch order, i == t fell
    # into the i + w >= t branch and double counted e[t - 1].
    if i == t:
        return e[t - 1]
    if i + w < t:
        # print('fpl_s: %d - %d [%d]' %(i, i + w, w))
        return priv_l(np.linalg.matrix_power(p, w), fpl_s(p, e, i + w, t, w), e[i - 1])
    # print('fpl_s: %d - %d [%d]' %(i, t, t - i))
    return priv_l(np.linalg.matrix_power(p, t - i), e[t - 1], e[i - 1])
'''
Calculate the total forward privacy loss at the current
timestamp using the static model, i.e., next releases
are grouped in a window of static size, using memoization.
Parameters:
p - The transition matrix representing the forward
temporal correlations.
e - The privacy budget for data publishing.
i - The timestamp of interest.
t - The time limit.
w - The window size to group next releases.
Returns:
a - The forward privacy loss at the current timestamp
due to the next data releases.
'''
def fpl_s_m(p, e, i, t, w):
    # Base case first (see fpl_s).
    if i == t:
        return e[t - 1]
    if i + w < t:
        return priv_l_m(np.linalg.matrix_power(p, w), fpl_s_m(p, e, i + w, t, w), e[i - 1])
    return priv_l_m(np.linalg.matrix_power(p, t - i), e[t - 1], e[i - 1])
'''
Calculate the total forward privacy loss at the current
timestamp using the linear model, i.e., next releases
are grouped in a window of a size that increases linearly.
Parameters:
p - The transition matrix representing the forward
temporal correlations.
e - The privacy budget for data publishing.
i - The timestamp of interest.
t - The time limit.
w - The window size to group next releases.
l - The linearly increasing coefficient that affects the
window size.
Returns:
a - The forward privacy loss at the current timestamp
due to the next data releases.
'''
def fpl_l(p, e, i, t, w, l):
    # Base case first (see fpl_s).
    if i == t:
        return e[t - 1]
    if i + w*l < t:
        # print('fpl_l: %d - %d [%d]' %(i, i + w*l, w*l))
        return priv_l(np.linalg.matrix_power(p, w*l), fpl_l(p, e, i + w*l, t, w, l + 1), e[i - 1])
    # print('fpl_l: %d - %d [%d]' %(i, t, t - i))
    return priv_l(np.linalg.matrix_power(p, t - i), e[t - 1], e[i - 1])
'''
Calculate the total forward privacy loss at the current
timestamp using the linear model, i.e., next releases
are grouped in a window of a size that increases linearly,
using memoization.
Parameters:
p - The transition matrix representing the forward
temporal correlations.
e - The privacy budget for data publishing.
i - The timestamp of interest.
t - The time limit.
w - The window size to group next releases.
l - The linearly increasing coefficient that affects the
window size.
Returns:
a - The forward privacy loss at the current timestamp
due to the next data releases.
'''
def fpl_l_m(p, e, i, t, w, l):
    # Base case first (see fpl_s).
    if i == t:
        return e[t - 1]
    if i + w*l < t:
        return priv_l_m(np.linalg.matrix_power(p, w*l), fpl_l_m(p, e, i + w*l, t, w, l + 1), e[i - 1])
    return priv_l_m(np.linalg.matrix_power(p, t - i), e[t - 1], e[i - 1])
'''
Calculate the total forward privacy loss at the current
timestamp using the exponential model, i.e., next releases
are grouped in a window of a size that increases exponentially.
Parameters:
p - The transition matrix representing the forward
temporal correlations.
e - The privacy budget for data publishing.
i - The timestamp of interest.
t - The time limit.
w - The window size to group next releases.
h - The exponentially increasing coefficient that affects
the window size.
Returns:
a - The forward privacy loss at the current timestamp
due to the next data releases.
'''
def fpl_e(p, e, i, t, w, h):
    # Base case first (see fpl_s).
    if i == t:
        return e[t - 1]
    if i + w**h < t:
        # print('fpl_e: %d - %d [%d]' %(i, i + w**h, w**h))
        return priv_l(np.linalg.matrix_power(p, w**h), fpl_e(p, e, i + w**h, t, w, h + 1), e[i - 1])
    # print('fpl_e: %d - %d [%d]' %(i, t, t - i))
    return priv_l(np.linalg.matrix_power(p, t - i), e[t - 1], e[i - 1])
'''
Calculate the total forward privacy loss at the current
timestamp using the exponential model, i.e., next releases
are grouped in a window of a size that increases exponentially,
using memoization.
Parameters:
p - The transition matrix representing the forward
temporal correlations.
e - The privacy budget for data publishing.
i - The timestamp of interest.
t - The time limit.
w - The window size to group next releases.
h - The exponentially increasing coefficient that affects
the window size.
Returns:
a - The forward privacy loss at the current timestamp
due to the next data releases.
'''
def fpl_e_m(p, e, i, t, w, h):
    # Base case first (see fpl_s).
    if i == t:
        return e[t - 1]
    if i + w**h < t:
        return priv_l_m(np.linalg.matrix_power(p, w**h), fpl_e_m(p, e, i + w**h, t, w, h + 1), e[i - 1])
    return priv_l_m(np.linalg.matrix_power(p, t - i), e[t - 1], e[i - 1])
'''
Calculate the total privacy loss at every timestamp.
Parameters:
bpl - The backward privacy loss.
fpl - The forward privacy loss.
e - The privacy budget for data publishing; the budget of each
release is counted in both bpl and fpl, so it is subtracted
once to avoid double counting.
Returns:
The list of total privacy loss at every timestamp.
'''
def tpl(bpl, fpl, e):
return [x + y - z for (x, y, z) in zip(bpl, fpl, e)]
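# Worked example (toy values): bpl = [1.0, 2.0], fpl = [2.0, 1.0], and
# e = [1.0, 1.0] give tpl = [1.0 + 2.0 - 1.0, 2.0 + 1.0 - 1.0] = [2.0, 2.0].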
'''
Calculate the temporal privacy loss at every timestamp
taking into account landmarks.
Parameters:
e - The privacy budget for data publishing.
p_b - The transition matrix representing the backward
temporal correlations.
p_f - The transition matrix representing the forward
temporal correlations.
seq - The point sequence.
lmdks - The landmarks.
Returns:
a_b - The backward privacy loss at the current timestamp
due to the previous data releases.
a_f - The forward privacy loss at the current timestamp
due to the next data releases.
a - The total privacy loss at every timestamp
taking into account landmarks.
'''
def tpl_lmdk_mem(e, p_b, p_f, seq, lmdks):
a_b = np.zeros(len(seq))
a_f = np.zeros(len(seq))
a = np.zeros(len(seq))
for t in seq:
t_prv, t_nxt = get_limits(t, seq, lmdks)
a_b[t - 1] = bpl_lmdk_mem(p_b, np.zeros(max(seq)), e, t, t_prv)[t - 1]
a_f[t - 1] = fpl_lmdk_mem(p_f, np.zeros(max(seq)), e, t, t_nxt)[t - 1]
if a_b[t - 1] > 0 and a_f[t - 1] > 0:
a[t - 1] = a_b[t - 1] + a_f[t - 1] - e[t - 1]
elif a_b[t - 1] > 0 or a_f[t - 1] > 0:
a[t - 1] = a_b[t - 1] + a_f[t - 1]
else:
a[t - 1] = e[t - 1]
return a_b, a_f, a
'''
Get the limits for the calculation of temporal privacy loss.
Parameters:
t - The current timestamp.
seq - The point sequence.
lmdks - The landmarks.
Returns:
t_prv - The previous landmark.
t_nxt - The next landmark.
'''
def get_limits(t, seq, lmdks):
# Add landmark limits.
seq_lmdks = np.copy(lmdks)
# if seq[0] not in seq_lmdks:
# seq_lmdks = np.append(seq_lmdks, seq[0])
# if seq[len(seq) - 1] not in seq_lmdks:
# seq_lmdks = np.append(seq_lmdks, seq[len(seq) - 1])
# seq_lmdks = np.sort(seq_lmdks)
# Initialize previous and next landmarks.
t_prv = 0
t_nxt = len(seq) + 1
# Add current timestamp to landmarks.
seq_lmdks_tmp = np.copy(seq_lmdks)
    if t not in seq_lmdks_tmp:
seq_lmdks_tmp = np.append(seq_lmdks, t)
seq_lmdks_tmp = np.sort(seq_lmdks_tmp)
# Find the index of the current timestamp in the landmarks.
idx, = np.where(seq_lmdks_tmp == t)
i = idx[0]
# Find previous landmark.
if i != 0:
t_prv = seq_lmdks_tmp[i - 1]
# Find next landmark.
if i != len(seq_lmdks_tmp) - 1:
t_nxt = seq_lmdks_tmp[i + 1]
return t_prv, t_nxt
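# Example: with seq = [1, 2, 3, 4, 5] and lmdks = [2, 4], get_limits(3, seq,
# lmdks) returns (2, 4); get_limits(1, seq, lmdks) returns (0, 2), where 0
# means that no landmark precedes t.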
'''
Plot the privacy loss of the time series.
Parameters:
title - The title of the plot.
e - The privacy budget for data publishing.
a_b - The backward privacy loss.
a_f - The forward privacy loss.
a - The total privacy loss.
Returns:
Nothing.
'''
def plot_loss(title, e, a_b, a_f, a):
plt.rc('font', family='serif')
plt.rc('font', size=10)
plt.rc('text', usetex=True)
p = np.arange(1.0, len(e) + 1, 1)
plt.plot(p,
e,
label=r'$\varepsilon$',
color='gray',
marker='s')
plt.plot(p,
a_b,
label=r'$\alpha^B$',
color='#e52e4e', # red
marker='<')
plt.plot(p,
a_f,
label=r'$\alpha^F$',
color='#009874', # green
marker='>')
plt.plot(p,
a,
label=r'$\alpha$',
color='#6082b6', # blue
marker='d')
plt.axis([1.0, len(e),
0.0, 1.5*max(a)])
plt.legend(loc='best', frameon=True)
plt.grid(axis='y', alpha=1.0) # Add grid on y axis.
plt.xlabel('time') # Set x axis label.
plt.ylabel('privacy loss') # Set y axis label.
plt.title(title)
plt.show()
'''
Plot a comparison of the privacy loss of all models.
Parameters:
title - The title of the plot.
a - The privacy loss of the basic model.
a_s - The privacy loss of the static model.
a_e - The privacy loss of the exponential model.
a_l - The privacy loss of the linear model.
Returns:
Nothing.
'''
def cmp_loss(title, a, a_s, a_e, a_l):
plt.rc('font', family='serif')
plt.rc('font', size=10)
plt.rc('text', usetex=True)
p = np.arange(1.0, len(a) + 1, 1)
plt.plot(p,
a,
label=r'$\alpha$',
color='red',
marker='s')
plt.plot(p,
a_s,
label=r'$\alpha_s$',
color='green',
marker='>')
plt.plot(p,
a_e,
label=r'$\alpha_e$',
color='blue',
marker='d')
plt.plot(p,
a_l,
label=r'$\alpha_l$',
color='yellow',
marker='<')
plt.axis([1.0, len(a),
0.0, 1.5*max(a)])
plt.legend(loc='best', frameon=True)
plt.grid(axis='y', alpha=1.0) # Add grid on y axis.
plt.xlabel('time') # Set x axis label.
plt.ylabel('privacy loss') # Set y axis label.
plt.title(title)
plt.show()
'''
Parse arguments.
Mandatory:
model, The model to be used: (b)asic,
(s)tatic, (l)inear, (e)xponential.
Optional:
-c, --correlation, The correlation degree.
-d, --debug, Enable debugging mode.
-e, --epsilon, The available privacy budget.
-m, --matrix, The size of the transition matrix.
-o, --output, The path to the output directory.
-t, --time, The time limit.
-w, --window, The size of the event protection window.
'''
def parse_args():
# Create argument parser.
parser = argparse.ArgumentParser()
# Mandatory arguments.
parser.add_argument('model', help='The model to be used.', choices=['b', 's', 'l', 'e'], type=str)
# Optional arguments.
parser.add_argument('-c', '--correlation', help='The correlation degree.', type=float, default=0.1)
parser.add_argument('-d', '--debug', help='Enable debugging mode.', action='store_true')
parser.add_argument('-e', '--epsilon', help='The available privacy budget.', type=float, default=1.0)
parser.add_argument('-m', '--matrix', help='The matrix size.', type=int, default=2)
parser.add_argument('-o', '--output', help='The path to the output directory.', type=str, default='')
parser.add_argument('-t', '--time', help='The time limit.', type=int, default=1)
parser.add_argument('-w', '--window', help='The size of the event protection window.', type=int, default=1)
# Parse arguments.
args = parser.parse_args()
return args
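# Example invocation (illustrative values):
# python3 gdp.py s -c 0.1 -e 1.0 -m 3 -t 10 -w 2 -o output
# Note that -o also acts as the switch for saving the results (see main()).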
def main(args):
global DEBUG, MISS, TOTAL
DEBUG = args.debug
MISS, TOTAL = 0, 0
e = [args.epsilon]*args.time
# Generate transition matrices.
p_b = gen_trans_mt(args.matrix, args.correlation)
p_f = gen_trans_mt(args.matrix, args.correlation)
if DEBUG:
# Run basic for comparison.
a_b = bpl(p_b, [0.0] * args.time, e, args.time)
a_f = fpl(p_f, [0.0] * args.time, e, args.time)
a = tpl(a_b, a_f, e)
output_path = args.output +\
'/' + args.model +\
'_c' + str(args.correlation) +\
'_e' + str(args.epsilon) +\
'_m' + str(args.matrix) +\
'_t' + str(args.time) +\
'_w' + str(args.window)
if args.model == 'b':
# Basic
# start_time = time.process_time()
a_b = bpl_m(p_b, [0.0] * args.time, e, args.time)
a_f = fpl_m(p_f, [0.0] * args.time, e, args.time)
a = tpl(a_b, a_f, e)
# duration1 = time.process_time() - start_time
if DEBUG:
            print('\nBasic\n'
                  't\tbpl\t\tfpl\t\ttpl')
for i in range(0, args.time):
print('%d\t%f\t%f\t%f' %(i + 1, a_b[i], a_f[i], a[i]))
print('\nSUM\t%f\t%f\t%f' %(sum(a_b), sum(a_f), sum(a)))
print('AVG\t%f\t%f\t%f' %(sum(a_b)/args.time, sum(a_f)/args.time, sum(a)/args.time))
plot_loss('Basic', e, a_b, a_f, a)
        if args.output: # argparse defines no 'Returns' attribute; save when an output path is given.
save_output(output_path, args.time, e, a_b, a_f, a)
# start_time = time.process_time()
# a_b_m = bpl_m(p_b, [0.0] * args.time, e, args.time)
# a_f_m = fpl_m(p_f, [0.0] * args.time, e, args.time)
# a_m = tpl(a_b_m, a_f_m, e)
# duration2 = time.process_time() - start_time
# diff = ((duration2 - duration1)/duration1)*100
# success = safe_div((TOTAL - MISS)*100, TOTAL)
# print('##############################\n'
# 'basic : %.4fs\n'
# 'w/ mem: %.4fs\n'
# '- succ: %d/%d (%.2f%%)\n'
# 'diff : %.4fs (%.2f%%)\n'
# 'OK : %r'
# %(duration1,
# duration2,
# TOTAL - MISS, TOTAL, success,
# duration2 - duration1, diff,
# np.array_equal(a, a_m)))
# MISS, TOTAL = 0, 0
elif args.model == 's':
# Static
# start_time = time.process_time()
a_s_b, a_s_f = [None]*args.time, [None]*args.time
for i in range(1, args.time + 1):
a_s_b[i - 1] = bpl_s_m(p_b, e, i, args.window)
a_s_f[i - 1] = fpl_s_m(p_f, e, i, args.time, args.window)
a_s = tpl(a_s_b, a_s_f, e)
# duration1 = time.process_time() - start_time
if DEBUG:
print('\nStatic\n'
't\tbpl\t\t\tfpl\t\t\ttpl')
for i in range(0, args.time):
print('%d\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
%(i + 1,
a_s_b[i], (a_s_b[i] - a_b[i])/a_b[i]*100,
a_s_f[i], (a_s_f[i] - a_f[i])/a_f[i]*100,
a_s[i], (a_s[i] - a[i])/a[i]*100))
print('\nSUM\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
%(sum(a_s_b), ((sum(a_s_b) - sum(a_b))/sum(a_b))*100,
sum(a_s_f), ((sum(a_s_f) - sum(a_f))/sum(a_f))*100,
sum(a_s), ((sum(a_s) - sum(a))/sum(a))*100))
print('AVG\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
%(sum(a_s_b)/args.time, ((sum(a_s_b)/args.time - sum(a_b)/args.time)/(sum(a_b)/args.time))*100,
sum(a_s_f)/args.time, ((sum(a_s_f)/args.time - sum(a_f)/args.time)/(sum(a_f)/args.time))*100,
sum(a_s)/args.time, ((sum(a_s)/args.time - sum(a)/args.time)/(sum(a)/args.time))*100))
plot_loss('Static', e, a_s_b, a_s_f, a_s)
        if args.output:
save_output(output_path, args.time, e, a_s_b, a_s_f, a_s)
# start_time = time.process_time()
# a_s_b_m, a_s_f_m = [None]*args.time, [None]*args.time
# for i in range(1, args.time + 1):
# a_s_b_m[i - 1] = bpl_s_m(p_b, e, i, args.window)
# a_s_f_m[i - 1] = fpl_s_m(p_f, e, i, args.time, args.window)
# a_s_m = tpl(a_s_b_m, a_s_f_m, e)
# duration2 = time.process_time() - start_time
# diff = ((duration2 - duration1)/duration1)*100
# success = safe_div((TOTAL - MISS)*100, TOTAL)
# print('##############################\n'
# 'static: %.4fs\n'
# 'w/ mem: %.4fs\n'
# '- succ: %d/%d (%.2f%%)\n'
# 'diff : %.4fs (%.2f%%)\n'
# 'OK : %r'
# %(duration1,
# duration2,
# TOTAL - MISS, TOTAL, success,
# duration2 - duration1, diff,
# np.array_equal(a_s, a_s_m)))
# MISS, TOTAL = 0, 0
elif args.model == 'l':
# Linear
l = 1
# start_time = time.process_time()
a_l_b, a_l_f = [None]*args.time, [None]*args.time
for i in range(1, args.time + 1):
a_l_b[i - 1] = bpl_l_m(p_b, e, i, args.window, l)
a_l_f[i - 1] = fpl_l_m(p_f, e, i, args.time, args.window, l)
a_l = tpl(a_l_b, a_l_f, e)
# duration1 = time.process_time() - start_time
if DEBUG:
print('\nLinear\n'
't\tbpl\t\t\tfpl\t\t\ttpl')
for i in range(0, args.time):
print('%d\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
%(i + 1,
a_l_b[i], (a_l_b[i] - a_b[i])/a_b[i]*100,
a_l_f[i], (a_l_f[i] - a_f[i])/a_f[i]*100,
a_l[i], (a_l[i] - a[i])/a[i]*100))
print('\nSUM\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
%(sum(a_l_b), ((sum(a_l_b) - sum(a_b))/sum(a_b))*100,
sum(a_l_f), ((sum(a_l_f) - sum(a_f))/sum(a_f))*100,
sum(a_l), ((sum(a_l) - sum(a))/sum(a))*100))
print('AVG\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
%(sum(a_l_b)/args.time, ((sum(a_l_b)/args.time - sum(a_b)/args.time)/(sum(a_b)/args.time))*100,
sum(a_l_f)/args.time, ((sum(a_l_f)/args.time - sum(a_f)/args.time)/(sum(a_f)/args.time))*100,
sum(a_l)/args.time, ((sum(a_l)/args.time - sum(a)/args.time)/(sum(a)/args.time))*100))
plot_loss('Linear', e, a_l_b, a_l_f, a_l)
        if args.output:
save_output(output_path, args.time, e, a_l_b, a_l_f, a_l)
# start_time = time.process_time()
# a_l_b_m, a_l_f_m = [None]*args.time, [None]*args.time
# for i in range(1, args.time + 1):
# a_l_b_m[i - 1] = bpl_l_m(p_b, e, i, args.window, l)
# a_l_f_m[i - 1] = fpl_l_m(p_f, e, i, args.time, args.window, l)
# a_l_m = tpl(a_l_b_m, a_l_f_m, e)
# duration2 = time.process_time() - start_time
# diff = ((duration2 - duration1)/duration1)*100
# success = safe_div((TOTAL - MISS)*100, TOTAL)
# print('##############################\n'
# 'linear: %.4fs\n'
# 'w/ mem: %.4fs\n'
# '- succ: %d/%d (%.2f%%)\n'
# 'diff : %.4fs (%.2f%%)\n'
# 'OK : %r'
# %(duration1,
# duration2,
# TOTAL - MISS, TOTAL, success,
# duration2 - duration1, diff,
# np.array_equal(a_l, a_l_m)))
# MISS, TOTAL = 0, 0
elif args.model == 'e':
# Exponential
h = 1
# start_time = time.process_time()
a_e_b, a_e_f = [None]*args.time, [None]*args.time
for i in range(1, args.time + 1):
a_e_b[i - 1] = bpl_e_m(p_b, e, i, args.window, h)
a_e_f[i - 1] = fpl_e_m(p_f, e, i, args.time, args.window, h)
a_e = tpl(a_e_b, a_e_f, e)
# duration1 = time.process_time() - start_time
if DEBUG:
print('\nExponential\n'
't\tbpl\t\t\tfpl\t\t\ttpl')
for i in range(0, args.time):
print('%d\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
%(i + 1,
a_e_b[i], (a_e_b[i] - a_b[i])/a_b[i]*100,
a_e_f[i], (a_e_f[i] - a_f[i])/a_f[i]*100,
a_e[i], (a_e[i] - a[i])/a[i]*100))
print('\nSUM\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
%(sum(a_e_b), ((sum(a_e_b) - sum(a_b))/sum(a_b))*100,
sum(a_e_f), ((sum(a_e_f) - sum(a_f))/sum(a_f))*100,
sum(a_e), ((sum(a_e) - sum(a))/sum(a))*100))
print('AVG\t%f (%.2f%%)\t%f (%.2f%%)\t%f (%.2f%%)'
%(sum(a_e_b)/args.time, ((sum(a_e_b)/args.time - sum(a_b)/args.time)/(sum(a_b)/args.time))*100,
sum(a_e_f)/args.time, ((sum(a_e_f)/args.time - sum(a_f)/args.time)/(sum(a_f)/args.time))*100,
sum(a_e)/args.time, ((sum(a_e)/args.time - sum(a)/args.time)/(sum(a)/args.time))*100))
plot_loss('Exponential', e, a_e_b, a_e_f, a_e)
        if args.output:
save_output(output_path, args.time, e, a_e_b, a_e_f, a_e)
# start_time = time.process_time()
# a_e_b_m, a_e_f_m = [None]*args.time, [None]*args.time
# for i in range(1, args.time + 1):
# a_e_b_m[i - 1] = bpl_e_m(p_b, e, i, args.window, h)
# a_e_f_m[i - 1] = fpl_e_m(p_f, e, i, args.time, args.window, h)
# a_e_m = tpl(a_e_b_m, a_e_f_m, e)
# duration2 = time.process_time() - start_time
# diff = ((duration2 - duration1)/duration1)*100
# success = safe_div((TOTAL - MISS)*100, TOTAL)
# print('##############################\n'
# 'exp : %.4fs\n'
# 'w/ mem: %.4fs\n'
# '- succ: %d/%d (%.2f%%)\n'
# 'diff : %.4fs (%.2f%%)\n'
# 'OK : %r'
# %(duration1,
# duration2,
# TOTAL - MISS, TOTAL, success,
# duration2 - duration1, diff,
# np.array_equal(a_e, a_e_m)))
# MISS, TOTAL = 0, 0
    # if DEBUG:
    #     cmp_loss('Backward', a_b, a_s_b, a_e_b, a_l_b)
    #     cmp_loss('Forward', a_f, a_s_f, a_e_f, a_l_f)
    #     cmp_loss('Total', a, a_s, a_e, a_l)
if __name__ == '__main__':
try:
args = parse_args()
print('##############################\n'
'Correlation: %f\n'
'Debug : %r\n'
'Epsilon : %f\n'
'Matrix : %d\n'
'Model : %s\n'
'Output : %s\n'
'Time : %d\n'
'Window : %d\n'
'##############################'
%(args.correlation,
args.debug,
args.epsilon,
args.matrix,
args.model,
args.output,
args.time,
args.window), flush=True)
start_time = time.process_time()
main(args)
end_time = time.process_time()
print('##############################\n'
'Time : %.4fs\n'
'##############################\n'
%(end_time - start_time), flush=True)
except KeyboardInterrupt:
print('Interrupted by user.')
exit()