code: Added libraries

This commit is contained in:
Manos Katsomallos 2021-07-25 16:51:51 +03:00
parent 6ec58b9adc
commit 7dc347cf53
2 changed files with 1446 additions and 0 deletions

479
code/lib/lmdk_bgt.py Normal file

@@ -0,0 +1,479 @@
#!/usr/bin/env python3
from geopy.distance import distance
import math
import numpy as np
import random
import sympy as sp
import time
import lmdk_lib
from matplotlib import pyplot as plt
def draw_bgts(title, bgts):
'''
Plots the allocated budget.
Parameters:
title - The title of the plot.
bgts - The allocated privacy budget.
Returns:
Nothing.
'''
lmdk_lib.plot_init()
p = np.arange(1, len(bgts) + 1, 1)
plt.plot(
p,
bgts,
linewidth=lmdk_lib.line_width,
markersize=lmdk_lib.marker_size,
markeredgewidth=0,
label=r'$\varepsilon$',
marker='s'
)
lmdk_lib.plot_legend()
# Set plot box
plt.axis([1, len(bgts), .0, max(bgts) + max(bgts)/2])
# Set x axis label
plt.xlabel('Timestamp')
# Set y axis label
plt.ylabel('Privacy budget')
plt.title(title.replace('_', '-'))
plt.show()
def validate_bgts(seq, lmdks, epsilon, bgts):
'''
Budget allocation validation.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
epsilon - The available privacy budget.
bgts - The privacy budget allocation.
Returns:
The index of the privacy budget that caused the failure or -1 if successful.
'''
bgts_sum = .0
# Landmarks
for i, p in enumerate(seq):
if np.any(lmdks[:] == p):
bgts_sum += bgts[i]
# Regular events
bgts_max = bgts_sum
for i, p in enumerate(seq):
if not np.any(lmdks[:] == p):
bgts_cur = bgts_sum + bgts[i]
if bgts_cur > bgts_max:
bgts_max = bgts_cur
if bgts_cur > epsilon and not math.isclose(bgts_cur, epsilon, rel_tol=.001):
print('Budget allocation validation failed: %.2f%% (%.4f/%.4f)' %(100*bgts_max/epsilon, bgts_max, epsilon))
return i
print(
'Landmark budget allocation: %.2f%% (%.4f/%.4f)\n'
'Overall budget allocation : %.2f%% (%.4f/%.4f)\n'
%(100*bgts_max/epsilon, bgts_max, epsilon,
100*sum(bgts)/epsilon, sum(bgts), epsilon,))
return -1
def uniform(seq, lmdks, epsilon):
'''
Uniform budget allocation.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
epsilon - The available privacy budget.
Returns:
bgts - The privacy budget allocation.
'''
bgts = np.zeros(len(seq))
# Allocate enough budget for all landmarks and one regular point
k = len(lmdks) + 1
# All points are landmarks
if k > len(seq):
k = len(lmdks)
# Allocate the budget
for i, _ in enumerate(bgts):
bgts[i] = epsilon/k
return bgts
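# Illustrative usage sketch (hypothetical values, not part of the original
# library): uniform() splits epsilon into len(lmdks) + 1 equal shares, so the
# landmark budgets plus any single regular budget stay within epsilon and
# validate_bgts() returns -1.
def _demo_uniform():
    seq = np.arange(1, 11)
    lmdks = np.array([2, 5, 7])
    epsilon = 1.0
    bgts = uniform(seq, lmdks, epsilon)
    # 4 equal shares of 0.25 each
    assert math.isclose(bgts[0], epsilon/4)
    assert validate_bgts(seq, lmdks, epsilon, bgts) == -1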
def geometric(seq, lmdks, epsilon):
'''
Geometric budget allocation.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
epsilon - The available privacy budget.
Returns:
bgts - The privacy budget allocation.
'''
bgts = np.zeros([len(seq)])
epsilon_avail = epsilon
# Landmarks
for i, p in enumerate(seq):
if np.any(lmdks[:] == p):
bgts[i] = epsilon_avail/2
epsilon_avail = epsilon_avail - epsilon_avail/2
# Regular events
for i, p in enumerate(seq):
if not np.any(lmdks[:] == p):
bgts[i] = epsilon_avail
return bgts
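# Illustrative usage sketch (hypothetical values): geometric() halves the
# remaining budget at every landmark and leaves the remainder to each regular
# point, so with epsilon = 1 and landmarks {2, 5, 7} the landmark budgets are
# 0.5, 0.25, 0.125 and every regular point gets 0.125.
def _demo_geometric():
    bgts = geometric(np.arange(1, 11), np.array([2, 5, 7]), 1.0)
    assert math.isclose(bgts[1], .5) and math.isclose(bgts[6], .125)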
def exponential(seq, lmdks, epsilon):
'''
Exponential uniform budget allocation (max to user-level).
Parameters:
seq - The point sequence.
lmdks - The landmarks.
epsilon - The available privacy budget.
Returns:
bgts - The privacy budget allocation.
'''
# Fallback to uniform if zero or max landmarks
if len(lmdks) == 0 or len(lmdks) == len(seq):
return uniform(seq, lmdks, epsilon)
# # In case of seq == lmdks
# l = 0
# N = epsilon/len(seq)
# Otherwise
bgts = np.zeros([len(seq)])
if len(seq) != len(lmdks):
# Find worst case regular point
p_sel = 0
for p in seq:
if not np.any(lmdks[:] == p):
p_sel = p
break
# List all landmark timestamps with the worst regular point
points = np.append(lmdks, [p_sel])
points = np.sort(points)
# epsilon_t = N*e^-l*t
l = sp.symbols('l', real=True)
# Cumulative allocation at landmarks and one extra point, i.e., L union {t}
T = seq.max()
# Bounding the privacy budgets at L union {t}
# epsilon = sum(N*e^-l*t) for t in L union {t}
eq = 0
for t in points:
eq += (((epsilon/len(seq))/(sp.exp(-1*l*T)))*sp.exp(-1*l*t))
try:
l = sp.solve(eq - epsilon, simplify=False, rational=False)[0]
# l = sp.solveset(eq <= epsilon, l, sp.S.Reals).sup
except Exception:
return bgts
# Allocate to the last point T epsilon/len(seq), i.e., user-level
N = (epsilon/len(seq))/sp.exp(-1*l*T)
# Allocate the budget
for i, t in enumerate(seq):
bgts[i] = N*sp.exp(-1*l*t)
return bgts
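# Illustrative usage sketch (hypothetical values): exponential() fits
# epsilon_t = N*e^(-l*t) with sympy so that the budgets of the landmarks plus
# the worst regular point sum to epsilon, while the last timestamp T keeps
# the user-level share epsilon/len(seq). If sympy cannot solve for l, the
# function falls back to an all-zero allocation.
def _demo_exponential():
    bgts = exponential(np.arange(1, 11), np.array([2, 5, 7]), 1.0)
    # Either a decaying allocation or zeros when no solution was found
    print(bgts)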
def linear_zero(seq, lmdks, epsilon):
'''
Linear uniform budget allocation (max to zero).
Parameters:
seq - The point sequence.
lmdks - The landmarks.
epsilon - The available privacy budget.
Returns:
bgts - The privacy budget allocation.
'''
bgts = np.zeros([len(seq)])
# Find worst case regular point
p_sel = 0
for p in seq:
if not np.any(lmdks[:] == p):
p_sel = p
break
# Sum all landmark timestamps with the worst regular point
sum_cur = p_sel
for p in lmdks:
sum_cur += p
# epsilon_t = a*t + b
b = sp.symbols('b')
# Cumulative allocation at landmarks and one extra point
T = seq[len(seq) - 1]
b = sp.solve(b - ((b/T)*sum_cur + epsilon)/(len(lmdks) + 1))[0]
# Allocate to the last point 0
a = -b/T
# Allocate the budget
for i, t in enumerate(seq):
bgts[i] = a*t + b
return bgts
def linear(seq, lmdks, epsilon):
'''
Linear uniform budget allocation (max to user-level).
Parameters:
seq - The point sequence.
lmdks - The landmarks.
epsilon - The available privacy budget.
Returns:
bgts - The privacy budget allocation.
'''
# Fallback to uniform if zero or max landmarks
if len(lmdks) == 0 or len(lmdks) == len(seq):
return uniform(seq, lmdks, epsilon)
# Find worst case regular point
p_sel = 0
for p in seq:
if not np.any(lmdks[:] == p):
p_sel = p
break
# Sum all landmark timestamps with the worst regular point
sum_cur = p_sel + np.sum(lmdks)
# epsilon_t = a*t + b
b = sp.symbols('b', real=True)
# Cumulative allocation at landmarks and one extra point, i.e., L union {t}
T = seq.max()
# Number of points to take into account
k = len(lmdks) + 1
# if len(lmdks) == len(seq):
# k = len(lmdks)
# Bounding the privacy budgets at L union {t}
# epsilon = a*sum(L union {t}) + |L union {t}|*b
b = sp.solve(b + (((epsilon/len(seq) - b)/T)*sum_cur - epsilon)/k, simplify=False, rational=False)[0]
# Allocate to the last point T epsilon/len(seq), i.e., user-level
a = (epsilon/len(seq) - b)/T
# Allocate the budget
bgts = np.zeros([len(seq)])
for i, t in enumerate(seq):
bgts[i] = a*t + b
return bgts
def plot_line(x_i, arr, label, marker, line):
plt.plot(x_i,
arr,
label=label,
color='black',
marker=marker,
linestyle=line,
# linewidth=1.0,
markerfacecolor='none')
def stepped(seq, lmdks, epsilon):
'''
Stepped budget allocation.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
epsilon - The available privacy budget.
Returns:
bgts - The privacy budget allocation.
'''
bgts = np.zeros([len(seq)])
epsilon_avail = epsilon/2
for i, p in enumerate(seq):
# Reduce the available budget when a landmark is reached
if np.any(lmdks[:] == p):
epsilon_avail = epsilon_avail/2
bgts[i] = epsilon_avail
return bgts
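# Illustrative usage sketch (hypothetical values): stepped() starts from
# epsilon/2 and halves the available budget at every landmark, so with
# epsilon = 1 and landmarks {2, 5} the budgets step from 0.5 down to 0.25
# and finally 0.125.
def _demo_stepped():
    bgts = stepped(np.arange(1, 7), np.array([2, 5]), 1.0)
    assert math.isclose(bgts[0], .5) and math.isclose(bgts[5], .125)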
def adaptive(seq, lmdks, epsilon):
'''
Adaptive budget allocation.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
epsilon - The available privacy budget.
Returns:
rls_data - The perturbed data.
bgts - The privacy budget allocation.
skipped - The number of skipped releases.
'''
# Uniform budget allocation
bgts = uniform(seq, lmdks, epsilon)
# Released
rls_data = [None]*len(seq)
# The sampling rate
samp_rt = 1
# Track landmarks
lmdk_cur = 0
# Track skipped releases
skipped = 0
for i, p in enumerate(seq):
# Check if current point is a landmark
if lmdk_lib.is_landmark(p, lmdks):
lmdk_cur += 1
# Get coordinates
loc = (p[1], p[2])
if lmdk_lib.should_sample(samp_rt) or i == 0:
# Add noise to original data
new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
# Adjust sampling rate
if i > 0:
if distance((rls_data[i - 1][1], rls_data[i - 1][2]), new_loc).km*1000 < 1/bgts[i]:
# Decrease
# samp_rt -= samp_rt*.9
# samp_rt -= samp_rt*.75
samp_rt -= samp_rt*.5
# samp_rt -= samp_rt*.25
# samp_rt -= samp_rt*.1
else:
# Increase
# samp_rt += (1 - samp_rt)*.9
# samp_rt += (1 - samp_rt)*.75
samp_rt += (1 - samp_rt)*.5
# samp_rt += (1 - samp_rt)*.25
# samp_rt += (1 - samp_rt)*.1
else:
skipped += 1
# Skip current release and approximate with previous
rls_data[i] = rls_data[i - 1]
if lmdk_lib.is_landmark(p, lmdks):
# Allocate the current budget to the following releases uniformly
for j in range(i + 1, len(seq)):
bgts[j] += bgts[i]/(len(lmdks) - lmdk_cur + 1)
# No budget was spent
bgts[i] = 0
return rls_data, bgts, skipped
def skip(seq, lmdks, epsilon):
'''
Skip landmarks.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
epsilon - The available privacy budget.
Returns:
rls_data - The perturbed data.
bgts - The privacy budget allocation.
'''
# Event-level budget allocation
bgts = np.array(len(seq)*[epsilon])
# Released
rls_data = [None]*len(seq)
for i, p in enumerate(seq):
# Get coordinates
loc = (p[1], p[2])
# Add noise to original data
new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
# Check if current point is a landmark
if lmdk_lib.is_landmark(p, lmdks):
if i > 0:
# Approximate with previous location
new_loc = (rls_data[i - 1][1], rls_data[i - 1][2])
bgts[i] = 0
rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
return rls_data, bgts
def sample(seq, lmdks, epsilon):
'''
Publish randomly.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
epsilon - The available privacy budget.
Returns:
rls_data - The perturbed data.
bgts - The privacy budget allocation.
skipped - The number of skipped releases.
'''
# Uniform budget allocation
bgts = uniform(seq, lmdks, epsilon)
# Released
rls_data = [None]*len(seq)
# The sampling rate
# (publish with 50% chance)
samp_rt = .5
# Track landmarks
lmdk_cur = 0
# Track skipped releases
skipped = 0
for i, p in enumerate(seq):
# Check if current point is a landmark
if lmdk_lib.is_landmark(p, lmdks):
lmdk_cur += 1
# Get coordinates
loc = (p[1], p[2])
if i == 0 or lmdk_lib.should_sample(samp_rt):
# Add noise to original data
new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
else:
skipped += 1
# Skip current release and approximate with previous
rls_data[i] = rls_data[i - 1]
if lmdk_lib.is_landmark(p, lmdks):
# Allocate the current budget to the following releases uniformly
for j in range(i + 1, len(seq)):
bgts[j] += bgts[i]/(len(lmdks) - lmdk_cur + 1)
# No budget was spent
bgts[i] = 0
return rls_data, bgts, skipped
def uniform_r(seq, lmdks, epsilon):
'''
Uniform budget allocation and release.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
epsilon - The available privacy budget.
Returns:
rls_data - The perturbed data.
bgts - The privacy budget allocation.
'''
# Released
rls_data = [None]*len(seq)
# Budgets
bgts = uniform(seq, lmdks, epsilon)
for i, p in enumerate(seq):
# Get coordinates
loc = (p[1], p[2])
# Add noise to original data
new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
return rls_data, bgts
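# Illustrative usage sketch (hypothetical trajectory): uniform_r() perturbs
# every point of a [uid, lat, lng, tim] trajectory with planar Laplace noise
# under a uniform budget; the two rows below are made up for the demo.
def _demo_uniform_r():
    seq = np.array([[1., 48.85, 2.35, 1.], [1., 48.86, 2.36, 2.]])
    lmdks = seq[:1]  # treat the first row as a landmark
    rls_data, bgts = uniform_r(seq, lmdks, 1.0)
    print(mae(seq, rls_data), bgts)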
def utility_analysis(seq, lmdks, o, epsilon):
'''
Analyze the utility.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
o - The perturbed data.
epsilon - The available privacy budget.
Returns:
Nothing.
'''
# Estimate Mean Absolute Error
mae = 0
# Compare with uniform
mae_u = 0
bgts_u = uniform(seq, lmdks, epsilon)
for i, p in enumerate(seq):
mae += distance((p[1], p[2]), (o[i][1], o[i][2])).km*1000/len(seq)
new_loc = lmdk_lib.add_polar_noise((p[1], p[2]), bgts_u[i])
mae_u += distance((p[1], p[2]), (new_loc[0], new_loc[1])).km*1000/len(seq)
print(
'\n########## Analysis ##########\n'
'MAE uniform : %.2fm\n'
'MAE current : %.2fm\n'
'Difference : %.2f%%\n'
'##############################\n'
%(mae_u, mae, 100*(mae - mae_u)/mae_u)
)
def mae(seq, o):
'''
The Mean Absolute Error (in meters) of a release.
Parameters:
seq - The point sequence.
o - The perturbed data.
Returns:
mae - The mean absolute error.
'''
mae = 0
for i, p in enumerate(seq):
mae += distance((p[1], p[2]), (o[i][1], o[i][2])).km*1000/len(seq)
return mae

967
code/lib/lmdk_lib.py Normal file

@@ -0,0 +1,967 @@
#!/usr/bin/env python3
from datetime import datetime
from geopy.distance import distance
import heapq
import itertools
from scipy.special import lambertw
import math
import numpy as np
import os
import random
import scipy.stats as stats
import time
from matplotlib import pyplot as plt
import zipfile
# Plot globals
dpi = 300
font_size = 24.0
line_width = 2.0
marker_size = 14.0
tick_length = 8.0
def add_polar_noise(loc, epsilon):
'''
Add noise from planar Laplace.
[https://github.com/chatziko/location-guard/blob/master/src/js/common/laplace.js]
Parameters:
loc - The original location
epsilon - The privacy budget
Returns:
The perturbed location (lat, lng)
'''
# The bearing in radians.
# Random number in [0, 2*pi).
theta = np.random.uniform(low = 0, high = np.pi*2)
# The distance in meters.
# Use the inverse cumulative polar laplacian distribution function.
r = -np.real((lambertw((np.random.uniform(low = 0, high = 1) - 1)/np.e, k = -1) + 1)/epsilon)
new_loc = None
while not is_valid(new_loc):
new_loc = distance(kilometers = r/1000).destination(point = loc, bearing = np.degrees(theta))
return new_loc
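# Illustrative usage sketch (hypothetical values): perturb a location with a
# budget of 0.01; the expected displacement of the planar Laplace mechanism
# grows with 1/epsilon, i.e., on the order of a few hundred meters here.
def _demo_add_polar_noise():
    loc = (48.8584, 2.2945)
    new_loc = add_polar_noise(loc, .01)
    print('%.1fm away' %(distance(loc, (new_loc[0], new_loc[1])).km*1000))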
def draw_line(line, x, label, marker):
axis_x = list(range(len(x)))
# plt.xticks(axis_x, x)
plt.plot(axis_x, x, label=label, marker=marker)
def eval_seq(dists):
'''
Calculate the standard deviation of a list of distances.
Parameters:
dists - A list of distances.
Returns:
The standard deviation of the distances.
'''
return np.std(dists)
def get_abs_dists(seq, comb, lmdks):
'''
Get the distances of the points in a combination from the start and end of the sequence.
Parameters:
seq - The point sequence.
comb - A possible point combination for a specified size.
lmdks - The landmarks.
Returns:
dists - The distances of the points in a combination.
'''
cur = np.append(comb, lmdks)
cur.sort()
start = seq[0]
end = seq[len(seq) - 1]
dists = np.zeros([len(cur)*2])
# Check if there are any points
if cur.size:
for i in range(0, len(cur)):
# From the start
dists[i*2] = abs(cur[i] - start)
# From the end
dists[i*2 + 1] = abs(cur[i] - end)
return dists
def get_combs(seq, size):
'''
Get all the possible landmark point combinations for a specified size.
Parameters:
seq - The point sequence.
size - The desired number of landmarks.
Returns:
All the possible landmark combinations for a specified size.
'''
return itertools.combinations(seq, size)
def get_combs_h(h, hist, max_sum):
'''
Get all histogram combinations with per-bin counts between hist[i] and h
whose total count does not exceed max_sum.
Parameters:
h - The maximum count per bin.
hist - The histogram with the minimum count per bin.
max_sum - The maximum total count.
Returns:
combs - The valid combinations.
'''
combs = []
for comb in itertools.product(range(h + 1), repeat=len(hist)):
check = True
for i, c in enumerate(comb):
if sum(comb) > max_sum or c < hist[i] or c > h:
check = False
break
if check:
combs.append(comb)
return combs
def get_emd(h1, h2):
'''
The earth mover's distance (EMD), or Wasserstein distance, of two histograms.
[https://stats.stackexchange.com/a/151362]
Pele et al., The quadratic-chi histogram distance family
Parameters:
h1 - A histogram.
h2 - Another histogram.
Return:
The EMD of the histograms.
'''
return stats.wasserstein_distance(h1,h2)
# return stats.wasserstein_distance(get_norm_hist(h1), get_norm_hist(h2))
def get_hist(seq, lmdks):
'''
Create a histogram from a set of landmarks.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
Return:
hist - The histogram of landmarks.
h - The bin size.
'''
# # Create a landmark sequence relative to the entire sequence
# lmdks_rel = np.array(lmdks)
# # Add the start of the sequence if not in landmarks
# if not np.any(lmdks_rel[:] == start):
# lmdks_rel = np.insert(lmdks_rel, 0, start)
# # Add the end of the sequence if not in landmarks
# if not np.any(lmdks_rel[:] == end):
# lmdks_rel = np.append(lmdks_rel, end)
# Dealing with zeros.
if len(seq) == 0 or len(lmdks) == 0:
return np.zeros(math.ceil(max(seq))), 1
# Interquartile range (IQR) is a measure of statistical dispersion, being equal to the difference between 75th and 25th percentiles, or between upper and lower quartiles.
# https://en.wikipedia.org/wiki/Interquartile_range
iqr = stats.iqr(lmdks)
# Use the Freedman-Diaconis rule to select the width of the bins to be used in the histogram.
# Robust (resilient to outliers) estimator that takes into account data variability and data size.
# https://en.wikipedia.org/wiki/Freedman%E2%80%93Diaconis_rule
h = 2*iqr*len(lmdks)**(-1/3)
if h < 1:
h = 1
# On the number of bins and width:
# https://en.wikipedia.org/wiki/Histogram#Number_of_bins_and_width
# Normalize the interval
h = math.ceil(max(seq)/math.ceil(max(seq)/math.ceil(h)))
# Create an empty histogram with intervals of size h
hist = np.zeros(math.ceil(max(seq)/h))
for lmdk in lmdks:
# Bins are 1-indexed in steps of h, e.g., values in (0, h] map to bin 0
hist[math.ceil(lmdk/h) - 1] += 1
return hist, h
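# Illustrative usage sketch (hypothetical values): bin 4 landmarks over a
# 10-point sequence; the Freedman-Diaconis rule yields the bin size h and
# the counts land in ceil(len(seq)/h) bins.
def _demo_get_hist():
    hist, h = get_hist(np.arange(1, 11), np.array([1, 2, 3, 8]))
    print(hist, h)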
def get_lmdks(seq, size, dist):
'''
Get a landmark set for a distribution type.
Parameters:
seq - The point sequence.
size - The landmarks' size.
dist - The distribution type code.
-1 - Left-skewed
0 - Symmetric
+1 - Right-skewed
+2 - Bimodal
+3 - Uniform
Return:
lmdks - The landmarks, sorted.
'''
scale = len(seq)/10
if dist == -1 or dist == 0 or dist == 1:
return MixtureModel([truncnorm(min(seq), max(seq), get_loc(seq, dist), scale)]).sample(min(seq), max(seq), [], size)
elif dist == 2:
return MixtureModel([
truncnorm(min(seq), max(seq), get_loc(seq, -1), scale),
truncnorm(min(seq), max(seq), get_loc(seq, +1), scale)
]).sample(min(seq), max(seq), [], size)
else:
return np.sort(np.random.choice(np.arange(min(seq), max(seq) + 1, 1), size=size, replace=False))
def get_lmdks_rand(seq, size, skew):
'''
Get a landmark set with a given skewness.
Parameters:
seq - The point sequence.
size - The landmarks' size.
skew - The desired skewness.
Return:
lmdks - The landmarks, or an empty tuple if no combination matches.
'''
# Get all possible landmark combinations
lmdk_combs = get_combs(seq, size)
# Find a set of landmarks for the given requirements
lmdks = ()
for comb in lmdk_combs:
skew_cur = get_skew(comb)
# Check if skewness is close to the requirement or has the same sign
if math.isclose(skew_cur, skew, rel_tol=.1) or ((skew_cur < 0) == (skew < 0)):
lmdks = comb
break
return lmdks
def get_loc(seq, skew):
'''
Get the location of the sequence with a given skewness.
Parameters:
seq - The point sequence.
skew - The desired skewness.
Return:
The location.
'''
loc_min = min(seq)
loc_max = max(seq)
if skew < 0:
loc_min = (max(seq) - min(seq))/2
elif skew > 0:
loc_max = (max(seq) - min(seq))/2
return int(loc_min + (loc_max - loc_min)/2)
def get_mean(seq, hist, h):
'''
Get the mean of the histogram.
Parameters:
seq - The point sequence.
hist - The histogram of landmarks.
h - The bin size.
Return:
The mean of the histogram.
'''
total = 0
for idx, count in enumerate(hist):
# Find bin limits
start = min(seq) + h*idx
end = min(seq) + h*(idx + 1)
if end > max(seq):
end = max(seq)
total += (start + (end - start)/2)*count
return total/np.sum(hist)
def get_norm(h1, h2):
'''
The Euclidean distance of two histograms.
[https://stackoverflow.com/a/1401828/13123075]
Parameters:
h1 - A histogram.
h2 - Another histogram.
Return:
The Euclidean distance of the histograms.
'''
return np.linalg.norm(h1 - h2)
def get_norm_hist(hist):
'''
Normalize the histogram.
Parameters:
hist - The original histogram.
Return:
The normalized histogram.
'''
# Convert the counts to float (astype returns a copy)
norm_hist = hist.astype(np.float32)
n = np.sum(norm_hist)
for i, a in enumerate(norm_hist):
if a:
norm_hist[i] = a/n
return norm_hist
def get_opts(seq, lmdks):
'''
Finds all the possible valid options.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
Returns:
A list with all the possible options.
Requirements:
n = Regular points
r = The size of a combination
Time - O(C(n, r) + 2^C(n, r)), because O(n*C(n, r)) = O(n*n^min(r, n - r))
Space - O(r*C(n, r))
'''
reg = get_reg(seq, lmdks)
# Find all possible combinations for every k
combs = []
for k in range(len(lmdks) + 1, seq[len(seq) - 1] + 1):
combs.append(get_combs(reg, k - len(lmdks)))
# Find all possible options for all combinations
opts = itertools.product(*combs)
# Keep only valid options, i.e., larger sets must contain every element of every smaller set
rslts = []
for opt in opts:
for i, _ in enumerate(opt):
if i and not set(opt[i - 1]).issubset(opt[i]):
break
if i == len(opt) - 1:
rslts.append(opt)
return rslts
def get_reg(seq, lmdks):
'''
Get the regular points, i.e., non-landmarks, in a sequence.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
Returns:
The non-landmark points in a sequence.
'''
return np.array([i for i in seq if i not in lmdks])
def get_rel_dists(seq, comb, lmdks):
'''
Get the distances of the points in a combination from the nearest point of reference.
That is, the previous/next point or the start/end if closer.
Parameters:
seq - The point sequence.
comb - A possible point combination for a specified size.
lmdks - The landmarks.
Returns:
dists - The distances of the points in a combination.
'''
# TODO: Review distances. Maybe take into account both ways?
cur = np.append(comb, lmdks)
cur.sort()
start = seq[0]
end = seq[len(seq) - 1]
dists = np.zeros([len(cur) + 1])
# Check if there are any points
if cur.size:
# First
dists[0] = abs(cur[0] - start)
# In-between
for i in range(0, len(cur) - 1):
dists[i + 1] = abs(cur[i] - cur[i + 1])
# Last
dists[len(cur)] = abs(cur[len(cur) - 1] - end)
return dists
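# Illustrative usage sketch (hypothetical values): for the sequence 1..10,
# the combination {3}, and the landmark {7}, the relative distances are
# measured between consecutive points of reference: |1-3|, |3-7|, |7-10|.
def _demo_get_rel_dists():
    dists = get_rel_dists(np.arange(1, 11), np.array([3]), np.array([7]))
    assert list(dists) == [2, 4, 3]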
def get_seq(start, end):
'''
Get a sequence of points.
Parameters:
start - The starting point of the sequence.
end - The ending point of the sequence.
Returns:
A point sequence.
'''
return np.arange(start, end + 1)
def get_shannon_ent(a):
'''
Get the Shannon entropy of an array.
It can be used to calculate the uniformity of a histogram.
Uniform probability yields maximum uncertainty and therefore maximum entropy.
Entropy, then, can only decrease from the value associated with uniform probability.
Parameters:
a - An array of integers
Return:
The Shannon entropy.
'''
# Keep only the non-zero elements of array a because log2(0) is -inf.
# The entropy of an event with zero probability is 0.
a = a[a != 0]
# Histograms are discrete probability distributions.
# We convert the counts to probabilities.
p_a = a/np.sum(a)
# Base 2 gives the unit of bits (or 'shannons').
return -np.sum(p_a*np.log2(p_a))
def get_shannon_ent_norm(a):
'''
Get the normalized Shannon entropy of an array.
It ranges from 0 (high entropy - close to uniform) to 1 (low entropy).
Parameters:
a - An array of integers
Return:
The normalized Shannon entropy.
'''
return 1 - get_shannon_ent(a)/np.log2(len(a))
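# Illustrative usage sketch (hypothetical values): a uniform histogram has
# maximum Shannon entropy (log2 of its length), so its normalized entropy is
# 0; putting all mass in a single bin drives the normalized entropy to 1.
def _demo_entropy():
    assert math.isclose(get_shannon_ent(np.array([5, 5, 5, 5])), 2)
    assert math.isclose(get_shannon_ent_norm(np.array([5, 5, 5, 5])), 0, abs_tol=1e-9)
    assert math.isclose(get_shannon_ent_norm(np.array([20, 0, 0, 0])), 1)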
def get_skew(lmdks):
'''
Fisher-Pearson coefficient of skewness.
negative - left (more data items right)
zero - symmetric data
positive - right (more data items left)
Parameters:
lmdks - The landmarks.
Return:
Fisher-Pearson coefficient of skewness.
'''
# def get_skew(seq, hist, h):
# return get_third(seq, hist, h)/get_std(seq, hist, h)**3
return stats.skew(lmdks)
def get_std(seq, hist, h):
'''
Get the standard deviation.
Parameters:
seq - The point sequence.
hist - The histogram of landmarks.
h - The bin size.
Return:
The standard deviation.
'''
hist_mean = get_mean(seq, hist, h)
total = 0
for idx, count in enumerate(hist):
# Find bin limits
start = min(seq) + h*idx
end = min(seq) + h*(idx + 1)
if end > max(seq):
end = max(seq)
total += (((start + (end - start)/2) - hist_mean)**2)*count
# The standard deviation is the square root of the variance
return math.sqrt(total/np.sum(hist))
def get_third(seq, hist, h):
'''
Get the third central moment.
Parameters:
seq - The point sequence.
hist - The histogram of landmarks.
h - The bin size.
Return:
The third central moment.
'''
hist_mean = get_mean(seq, hist, h)
total = 0
for idx, count in enumerate(hist):
# Find bin limits
start = min(seq) + h*idx
end = min(seq) + h*(idx + 1)
if end > max(seq):
end = max(seq)
total += (((start + (end - start)/2) - hist_mean)**3)*count
return total/np.sum(hist)
def dist_type_to_str(d):
'''
Convert the distribution type code to string.
Parameters:
d - The distribution type code.
-1 - Left-skewed
0 - Symmetric
+1 - Right-skewed
+2 - Bimodal
+3 - Uniform
Return:
The distribution type.
'''
if d == -1:
return 'Left-skewed'
elif d == 0:
return 'Symmetric'
elif d == 1:
return 'Right-skewed'
elif d == 2:
return 'Bimodal'
elif d == 3:
return 'Uniform'
else:
return 'Undefined'
class MixtureModel(stats.rv_continuous):
'''
A mixture model of continuous probability distributions
[https://stackoverflow.com/a/47763145/13123075]
'''
def __init__(self, submodels, *args, **kwargs):
super().__init__(*args, **kwargs)
self.submodels = submodels
def _pdf(self, x):
pdf = self.submodels[0].pdf(x)
for submodel in self.submodels[1:]:
pdf += submodel.pdf(x)
pdf /= len(self.submodels)
return pdf
def rvs(self, size):
submodel_choices = np.random.randint(len(self.submodels), size=size)
submodel_samples = [submodel.rvs(size=size) for submodel in self.submodels]
rvs = np.choose(submodel_choices, submodel_samples)
return rvs
def sample(self, lower, upper, sample, size):
'''
Sample from the mixture model without replacement.
[https://stackoverflow.com/a/20548895/13123075]
Parameters:
self - The mixture model.
lower - The lower bound of the range.
upper - The upper bound of the range.
sample - Items to be excluded.
size - The sample size.
Returns:
A sample.
'''
w = self.pdf(range(int(lower), int(upper) + 1))
idx = []
for i in range(int(lower), int(upper) + 1):
if i not in sample:
idx.append(i)
# w[0] corresponds to the value lower
elt = [(math.log(random.random())/w[i - int(lower)], i) for i in idx]
return np.sort([x[1] for x in heapq.nlargest(size, elt)])
def plot_init():
'''
Initialize the plot.
'''
# Reset
plt.close('all')
# Style
plt.style.use('classic')
# DPI
plt.figure(dpi=dpi)
# Font
plt.rc('font', family='sans-serif')
plt.rc('font', **{'sans-serif':['Liberation Sans']})
plt.rc('font', size=font_size)
# Grid
plt.setp(plt.gca().spines.values(), linewidth=line_width)
plt.grid(True, axis='y', linewidth=line_width)
# Ticks
plt.gca().tick_params(which='both', width=line_width)
plt.gca().tick_params(which='major', length=tick_length)
plt.gca().tick_params(which='minor', length=tick_length/2)
# Colors
plt.subplot(111).set_prop_cycle('color', ['#212121', '#616161', '#9e9e9e', '#bdbdbd', '#e0e0e0', '#f5f5f5'])
# Layout
plt.tight_layout()
def plot_legend():
'''
Initialize the plot legend.
'''
plt.legend(
loc='best',
fontsize=font_size,
numpoints=1,
borderpad=.2, # default: 0.4
labelspacing=.25, # default: 0.5
handlelength=2.0, # default: 2.0
handletextpad=.4, # default: 0.8
borderaxespad=.5, # default: 0.5
)
def save_plot(path):
'''
Save the plot to a file.
Parameters:
path - The desired location.
Returns:
Nothing.
'''
# Save plot
os.makedirs(os.path.dirname(path), exist_ok=True)
plt.savefig(path, bbox_inches='tight')
# Clean up
plt.clf()
plt.close()
def print_lmdks(seq, lmdks):
'''
Print the landmarks.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
Returns:
Nothing.
'''
print('\n######### Landmarks ##########')
print(lmdks)
print('[', end='', flush=True)
for p in seq:
if np.any(lmdks[:] == p):
print('\'', end='', flush=True)
else:
print('.', end='', flush=True)
print(']', end='', flush=True)
print('\n##############################\n')
def plot_dist(seq, dist):
'''
Plot the probability distribution.
Parameters:
seq - The point sequence.
dist - The distribution type code.
-1 - Left-skewed
0 - Symmetric
+1 - Right-skewed
+2 - Bimodal
+3 - Uniform
Returns:
Nothing.
'''
scale = len(seq)/10
p = stats.uniform.pdf(seq)
if dist == -1 or dist == 0 or dist == 1:
p = MixtureModel([truncnorm(min(seq), max(seq), get_loc(seq, dist), scale)]).pdf(seq)
elif dist == 2:
p = MixtureModel([
truncnorm(min(seq), max(seq), get_loc(seq, -1), scale),
truncnorm(min(seq), max(seq), get_loc(seq, +1), scale)
]).pdf(seq)
plt.plot(seq, p)
plt.show()
def simplify_data(seq, lmdks):
'''
Derive synthetic data from real data.
Parameters:
seq - The trajectory.
lmdks - The landmarks.
Returns:
The simplified sequence and landmarks.
'''
seq_s = get_seq(1, len(seq))
lmdks_s = []
for i, p in enumerate(seq):
if is_landmark(p, lmdks):
lmdks_s.append(i + 1)
return seq_s, np.array(lmdks_s)
def truncnorm(lower, upper, loc, scale):
'''
A truncated normal continuous random variable.
Parameters:
lower - The lower bound of the range.
upper - The upper bound of the range.
loc - The location of the distribution.
scale - The spread of the distribution.
Returns:
A truncated normal continuous random variable.
'''
return stats.truncnorm((lower - loc)/scale, (upper - loc)/scale, loc, scale)
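# Illustrative usage sketch (hypothetical values): a bimodal mixture over
# [1, 100] built from two truncated normals, sampled without replacement to
# draw 10 landmark positions (this mirrors get_lmdks() for dist == 2).
def _demo_mixture():
    mm = MixtureModel([
        truncnorm(1, 100, 25, 10),
        truncnorm(1, 100, 75, 10),
    ])
    print(mm.sample(1, 100, [], 10))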
def is_valid(coord):
'''
Check if coordinates are valid.
Parameters:
coord - The coordinates to be checked.
Returns:
True - Valid coordinates.
False - Invalid coordinates.
'''
if coord is None or (float(coord[0]) > 90 or float(coord[0]) < -90 # lat
or float(coord[1]) > 180 or float(coord[1]) < -180): #lng
return False
return True
def save_data(args, data, name):
'''
Save to file and compress.
Parameters:
args - The execution arguments.
data - The data.
name - The name.
Returns:
Nothing.
'''
try:
path = os.path.dirname(os.path.abspath(args.res)) + '/' + name
if 'lmdks' in name:
path += '(dis=' + str(args.dist) + ', per=' + str(args.per) + ')'
print('Saving to %s... ' %(path), end='', flush=True)
with open(path, 'wb') as file:
_, idx = np.unique(data, axis=0, return_index=True)
np.save(file, data[np.sort(idx)], allow_pickle=False)
print('[OK]')
# Compress file
print('Compressing in %s... ' %(args.res), end='', flush=True)
with zipfile.ZipFile(args.res, 'a') as zip_file:
zip_file.write(path, os.path.basename(path))
print('[OK]')
# Delete file
print('Deleting %s... ' %(path), end='', flush=True)
os.remove(path)
print('[OK]')
except Exception as e:
print('[Error: %s]' %(e))
def load_data(args, name):
'''
Load data from file.
Parameters:
args - The execution arguments.
name - The name.
Returns:
data - The data set.
0: uid, 1: lat, 2: lng, 3: tim
'''
data = np.array([])
try:
path = os.path.dirname(os.path.abspath(args.res)) + '/' + name
if 'lmdks' in name:
path += '(dis=' + str(args.dist) + ', per=' + str(args.per) + ')'
print('Loading %s... ' %(path), end='', flush=True)
with zipfile.ZipFile(args.res, 'r') as zip_file:
data = np.load(zip_file.open(os.path.basename(path)))
print('[OK]')
except Exception as e:
print('[Error: %s]' %(e))
return data
def find_lmdks(usrs_data, args):
'''
Find users' landmarks.
Parameters:
args - The execution arguments.
usrs_data - The users' data.
0: uid, 1: lat, 2: lng, 3: tim
Returns:
usrs_lmdks - The users' landmarks.
0: lid, 1: uid, 2: lat, 3: lng, 4: tim
'''
usrs_lmdks = np.empty((0,5), np.float32)
traj_cur = 0
lmdk_id = 0
usrs = np.unique(usrs_data[:,0])
for usr_i, usr in enumerate(usrs):
# Initialize user's landmarks list
lmdks = []
traj = usrs_data[usrs_data[:,0]==usr, :]
traj_cur += len(traj)
print(
'[%d%%] Points: %d/%d | Users: %d/%d... '
%((traj_cur/usrs_data.shape[0])*100, traj_cur, usrs_data.shape[0], usr, usrs[len(usrs) - 1]),
end='', flush=True
)
# Check the actual points
i = 0
while i < len(traj) - 1:
lmdk_cur = []
if is_valid((traj[i][1], traj[i][2])):
for j in range(i + 1, len(traj)):
if is_valid((traj[j][1], traj[j][2])):
# Add the beginning only the first time
if j == i + 1:
lmdk_cur.append([lmdk_id, usr, traj[i][1], traj[i][2], traj[i][3]])
# Add the new point
lmdk_cur.append([lmdk_id, usr, traj[j][1], traj[j][2], traj[j][3]])
# Distance in meters
dist = distance((traj[i][1], traj[i][2]), (traj[j][1], traj[j][2])).km*1000
# Distance exceeded or reached end of iteration
if dist > args.dist or j == len(traj) - 1:
# Period in minutes
per = abs(datetime.fromtimestamp(int(traj[i][3])) - datetime.fromtimestamp(int(traj[j][3]))).total_seconds()/60
# Check if enough time passed
if per > args.per:
# lmdk_id starts from 1
lmdk_id += 1
# Assign id to current landmark
for l in lmdk_cur:
l[0] = lmdk_id
# Append current landmark
lmdks += lmdk_cur
# Continue checking from the current point
i = j
break
# No landmark was found, continue from next point
if i == 0 or not is_valid((traj[i][1], traj[i][2])) or 'j' not in vars() or i != j:
i += 1
print('[OK]')
if lmdks:
usrs_lmdks = np.append(usrs_lmdks, np.asarray(lmdks, dtype=np.float32), axis=0)
return usrs_lmdks
def lmdks_stats(args, usrs_lmdks):
'''
Generate landmarks' stats.
Parameters:
args - The execution arguments.
usrs_lmdks - The users' landmarks.
0: lid, 1: uid, 2: lat, 3: lng, 4: tim
Returns:
Nothing.
'''
lmdks_stats = {
'total': 0,
'len': 0,
'count': {},
}
# Check every user
usrs = np.unique(usrs_lmdks[:, 1])
for usr in usrs:
print(
'[%d%%] Calculating landmark stats for user %d/%d... '
%(usr*100/len(usrs), usr, np.max(usrs)),
end='', flush=True
)
# Check each user's landmarks
for lmdk in np.unique(usrs_lmdks[usrs_lmdks[:, 1] == usr, 0]):
lmdk_traj = usrs_lmdks[usrs_lmdks[:, 0] == lmdk, :]
lmdks_stats['total'] += 1
lmdks_stats['len'] += len(lmdk_traj)
if lmdks_stats['count'].get(len(lmdk_traj)) == None:
lmdks_stats['count'][len(lmdk_traj)] = 1
else:
lmdks_stats['count'][len(lmdk_traj)] += 1
print('[OK]')
hist_min, hist_max = min(lmdks_stats['count'], key=int, default=0), max(lmdks_stats['count'], key=int, default=0)
# Make histogram
x = []
for i in range(hist_min, hist_max + 1):
if lmdks_stats['count'].get(i) != None:
x.extend([i]*lmdks_stats['count'].get(i))
# Show stats
print(
'\n############ Stats ###########\n'
'Landmarks : %d\n'
' Length\n'
' Total : %d (%.2f%%)\n'
' Minimum : %d\n'
' Maximum : %d\n'
'##############################\n'
%(lmdks_stats['total'], lmdks_stats['len'], (lmdks_stats['len']/args.time)*100, min(lmdks_stats['count'], key=int, default=0), max(lmdks_stats['count'], key=int, default=0))
)
# # Initialize plot
# plot_init()
# # Set x axis
# plt.xlabel('Landmarks sequence length')
# plt.xticks(rotation='vertical')
# # The y axis
# plt.ylabel('Number of sequences')
# plt.yscale('log')
# Create histogram
# plt.hist(x, bins=(hist_max - hist_min))
# Show plot
# plt.show()
# Save plot
# save_plot(os.path.dirname(args.arc) + '/' + 'results' + '(dis=' + str(args.dist) + ', per=' + str(args.per) + ').pdf')
def should_sample(samp_rt):
'''
Randomly decide to release with noise.
Parameters:
samp_rt - The sampling rate (0, 1]
Returns:
True/False
'''
return random.choices(population=[True, False], weights=[samp_rt, 1 - samp_rt], k=1)[0]
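# Illustrative usage sketch: with a sampling rate of .25, roughly a quarter
# of the decisions come back True over many draws.
def _demo_should_sample():
    hits = sum(should_sample(.25) for _ in range(10000))
    print(hits/10000)  # ~0.25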
def is_landmark(p, lmdks):
'''
Check if a point is a landmark.
Parameters:
p - The point to check.
lmdks - A 2d array of landmarks.
Returns:
True/False
'''
if len(lmdks) and any(np.equal(lmdks[:, 1:5], p).all(1)):
return True
return False