code: Added libraries
This commit is contained in:
parent
6ec58b9adc
commit
7dc347cf53
479
code/lib/lmdk_bgt.py
Normal file
479
code/lib/lmdk_bgt.py
Normal file
@ -0,0 +1,479 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
from geopy.distance import distance
|
||||||
|
import math
|
||||||
|
import numpy as np
|
||||||
|
import random
|
||||||
|
import sympy as sp
|
||||||
|
import time
|
||||||
|
import lmdk_lib
|
||||||
|
from matplotlib import pyplot as plt
|
||||||
|
|
||||||
|
|
||||||
|
def draw_bgts(title, bgts):
    '''
    Plot the privacy budget allocated at every timestamp and show the figure.

    Parameters:
        title - The title of the plot (underscores are displayed as hyphens).
        bgts  - The allocated privacy budget, one value per timestamp.
    Returns:
        Nothing.
    '''
    lmdk_lib.plot_init()
    n = len(bgts)
    # Timestamps on the x axis are 1-based
    timestamps = np.arange(1, n + 1, 1)
    style = {
        'linewidth': lmdk_lib.line_width,
        'markersize': lmdk_lib.marker_size,
        'markeredgewidth': 0,
        'label': r'$\varepsilon$',
        'marker': 's',
    }
    plt.plot(timestamps, bgts, **style)
    lmdk_lib.plot_legend()
    # Set plot box, leaving 50% headroom above the largest budget
    top = max(bgts)
    plt.axis([1, n, .0, top + top/2])
    # Axis labels and title
    plt.xlabel('Timestamp')
    plt.ylabel('Privacy budget')
    plt.title(title.replace('_', '-'))
    plt.show()
|
||||||
|
|
||||||
|
|
||||||
|
def validate_bgts(seq, lmdks, epsilon, bgts):
    '''
    Validate a budget allocation.

    The budgets of all landmarks plus the budget of any single regular
    event must not exceed epsilon (within a relative tolerance of 0.001).
    A summary is printed on success and on failure.

    Parameters:
        seq     - The point sequence.
        lmdks   - The landmarks.
        epsilon - The available privacy budget.
        bgts    - The privacy budget allocation.
    Returns:
        The index of the privacy budget that caused the failure or -1 if successful.
    '''
    # Classify every point once: truthy for landmarks, falsy for regular events
    is_lmdk = [np.any(lmdks[:] == p) for p in seq]
    # Landmarks
    bgts_sum = .0
    for i, flag in enumerate(is_lmdk):
        if flag:
            bgts_sum += bgts[i]
    # Regular events
    bgts_max = bgts_sum
    for i, flag in enumerate(is_lmdk):
        if flag:
            continue
        bgts_cur = bgts_sum + bgts[i]
        if bgts_cur > bgts_max:
            bgts_max = bgts_cur
        exceeded = bgts_cur > epsilon and not math.isclose(bgts_cur, epsilon, rel_tol=.001)
        if exceeded:
            print('Budget allocation validation failed: %.2f%% (%.4f/%.4f)' %(100*bgts_max/epsilon, bgts_max, epsilon))
            return i
    bgts_all = sum(bgts)
    print(
        'Landmark budget allocation: %.2f%% (%.4f/%.4f)\n'
        'Overall budget allocation : %.2f%% (%.4f/%.4f)\n'
        %(100*bgts_max/epsilon, bgts_max, epsilon,
            100*bgts_all/epsilon, bgts_all, epsilon,))
    return -1
|
||||||
|
|
||||||
|
|
||||||
|
def uniform(seq, lmdks, epsilon):
    '''
    Uniform budget allocation.

    Every point receives epsilon/k, where k is the number of landmarks plus
    one regular point (or just the number of landmarks when that count
    would exceed the sequence length).

    Parameters:
        seq     - The point sequence.
        lmdks   - The landmarks.
        epsilon - The available privacy budget.
    Returns:
        bgts - The privacy budget allocation.
    '''
    n = len(seq)
    # Enough budget for all landmarks and one regular point
    k = len(lmdks) + 1
    # All points are landmarks
    if k > n:
        k = len(lmdks)
    bgts = np.zeros(n)
    # Nothing to allocate (and nothing to divide) for an empty sequence
    if n:
        bgts[:] = epsilon/k
    return bgts
|
||||||
|
|
||||||
|
|
||||||
|
def geometric(seq, lmdks, epsilon):
    '''
    Geometric budget allocation.

    Each landmark, in sequence order, receives half of the budget that is
    still available; every regular event receives whatever remains after
    all landmarks have been served.

    Parameters:
        seq     - The point sequence.
        lmdks   - The landmarks.
        epsilon - The available privacy budget.
    Returns:
        bgts - The privacy budget distribution.
    '''
    # Classify every point once
    is_lmdk = [np.any(lmdks[:] == p) for p in seq]
    bgts = np.zeros([len(seq)])
    remaining = epsilon
    # Landmarks
    for i, flag in enumerate(is_lmdk):
        if flag:
            bgts[i] = remaining/2
            remaining = remaining - remaining/2
    # Regular events
    for i, flag in enumerate(is_lmdk):
        if not flag:
            bgts[i] = remaining
    return bgts
|
||||||
|
|
||||||
|
|
||||||
|
def exponential(seq, lmdks, epsilon):
    '''
    Exponential uniform budget allocation (max to user-level).

    The budget at timestamp t has the form N*e^(-l*t). N is tied to l so
    that the largest timestamp T receives epsilon/len(seq), and l is solved
    so that the budgets at the landmarks plus the first regular point sum
    to epsilon. Falls back to uniform() when there are no landmarks or when
    every point is a landmark. If the solver raises, an all-zero allocation
    is returned.

    Parameters:
        seq     - The point sequence.
        lmdks   - The landmarks.
        epsilon - The available privacy budget.
    Returns:
        bgts - The privacy budget allocation.
    '''
    # Fallback to uniform if zero or max landmarks
    if len(lmdks) == 0 or len(lmdks) == len(seq):
        return uniform(seq, lmdks, epsilon)
    # # In case of seq == lmdks
    # l = 0
    # N = epsilon/len(seq)
    # Otherwise
    bgts = np.zeros([len(seq)])
    # NOTE(review): always true after the fallback above; the extent of this
    # block was reconstructed (the source lost its indentation) — confirm.
    if len(seq) != len(lmdks):
        # Find worst case regular point (the first non-landmark in seq)
        p_sel = 0
        for p in seq:
            if not np.any(lmdks[:] == p):
                p_sel = p
                break
        # List all landmark timestamps with the worst regular
        points = np.append(lmdks, [p_sel])
        points = np.sort(points)
        # epsilon_t = N*e^-l*t
        l = sp.symbols('l', real=True)
        # Cumulative allocation at landmarks and one extra point, i.e., L union {t}
        T = seq.max()
        # Bounding the privacy budgets at L union {t}
        # epsilon = sum(N*e^-l*t) for t in L union {t}
        # NOTE: this symbol is immediately shadowed by the loop variable below
        t = sp.symbols('t')
        eq = 0
        for t in points:
            eq += (((epsilon/len(seq))/(sp.exp(-1*l*T)))*sp.exp(-1*l*t))
        try:
            # Take the first solution returned by the solver
            l = sp.solve(eq - epsilon, simplify=False, rational=False)[0]
            # l = sp.solveset(eq <= epsilon, l, sp.S.Reals).sup
        except Exception:
            # No usable solution: return the all-zero allocation
            return bgts
        # Allocate to the last point T epsilon/len(seq), i.e., user-level
        N = (epsilon/len(seq))/sp.exp(-1*l*T)
        # Allocate the budget
        for i, t in enumerate(seq):
            bgts[i] = N*sp.exp(-1*l*t)
    return bgts
|
||||||
|
|
||||||
|
|
||||||
|
def linear_zero(seq, lmdks, epsilon):
    '''
    Linear uniform budget allocation (max to zero).

    The budget at timestamp t is a*t + b with a = -b/T, where T is the last
    element of seq, so the last point receives zero. b is solved so that
    the budgets at the landmarks plus the first regular point sum to epsilon.

    Parameters:
        seq     - The point sequence.
        lmdks   - The landmarks.
        epsilon - The available privacy budget.
    Returns:
        bgts - The privacy budget allocation.
    '''
    # Worst case regular point: the first non-landmark (0 if there is none)
    p_sel = next((p for p in seq if not np.any(lmdks[:] == p)), 0)
    # Sum all landmark timestamps with the worst regular
    sum_cur = sum(lmdks, p_sel)
    # epsilon_t = a*t + b
    b_sym = sp.symbols('b')
    # Cumulative allocation at landmarks and one extra point
    T = seq[-1]
    b = sp.solve(b_sym - ((b_sym/T)*sum_cur + epsilon)/(len(lmdks) + 1))[0]
    # Allocate to the last point 0
    a = -b/T
    # Allocate the budget
    bgts = np.zeros([len(seq)])
    for i, t in enumerate(seq):
        bgts[i] = a*t + b
    return bgts
|
||||||
|
|
||||||
|
|
||||||
|
def linear(seq, lmdks, epsilon):
    '''
    Linear uniform budget allocation (max to user-level).

    The budget at timestamp t is a*t + b, chosen so that the largest
    timestamp T receives epsilon/len(seq) and the budgets at the landmarks
    plus the first regular point sum to epsilon. Falls back to uniform()
    when there are no landmarks or when every point is a landmark.

    Parameters:
        seq     - The point sequence.
        lmdks   - The landmarks.
        epsilon - The available privacy budget.
    Returns:
        bgts - The privacy budget allocation.
    '''
    # Fallback to uniform if zero or max landmarks
    if len(lmdks) == 0 or len(lmdks) == len(seq):
        return uniform(seq, lmdks, epsilon)
    # Worst case regular point: the first non-landmark (0 if there is none)
    p_sel = next((p for p in seq if not np.any(lmdks[:] == p)), 0)
    # Sum all landmark timestamps with the worst regular
    sum_cur = p_sel + np.sum(lmdks)
    # epsilon_t = a*t + b
    b = sp.symbols('b', real=True)
    # Cumulative allocation at landmarks and one extra point, i.e., L union {t}
    T = seq.max()
    # Number of points to take into account
    k = len(lmdks) + 1
    # Bounding the privacy budgets at L union {t}
    # epsilon = a*sum(L union {t}) + |L union {t}|*b
    b = sp.solve(b + (((epsilon/len(seq) - b)/T)*sum_cur - epsilon)/k, simplify=False, rational=False)[0]
    # Allocate to the last point T epsilon/len(seq), i.e., user-level
    a = (epsilon/len(seq) - b)/T
    # Allocate the budget
    bgts = np.zeros([len(seq)])
    for i, t in enumerate(seq):
        bgts[i] = a*t + b
    return bgts
|
||||||
|
|
||||||
|
|
||||||
|
def plot_line(x_i, arr, label, marker, line):
    '''
    Plot a black line with hollow markers on the current figure.

    Parameters:
        x_i    - The x values.
        arr    - The y values.
        label  - The legend label.
        marker - The marker style.
        line   - The line style.
    Returns:
        Nothing.
    '''
    style = dict(
        label=label,
        color='black',
        marker=marker,
        linestyle=line,
        markerfacecolor='none',
    )
    plt.plot(x_i, arr, **style)
|
||||||
|
|
||||||
|
|
||||||
|
def stepped(seq, lmdks, epsilon):
    '''
    Stepped budget allocation.

    The allocation starts at epsilon/2 and is halved every time a landmark
    is reached; each point (landmark or not) receives the current level.

    Parameters:
        seq     - The point sequence.
        lmdks   - The landmarks.
        epsilon - The available privacy budget.
    Returns:
        bgts - The privacy budget allocation.
    '''
    level = epsilon/2
    levels = []
    for p in seq:
        # Reduce the available budget when a landmark is reached
        if np.any(lmdks[:] == p):
            level = level/2
        levels.append(level)
    return np.array(levels, dtype=float)
|
||||||
|
|
||||||
|
|
||||||
|
def adaptive(seq, lmdks, epsilon):
    '''
    Adaptive budget allocation.

    Starts from the uniform allocation and decides at every point whether
    to publish a perturbed location or to repeat the previous release. The
    sampling rate is halved when a new release lies closer than 1/budget
    metres to the previous one, and moved halfway towards 1 otherwise.

    Parameters:
        seq     - The point sequence; each point is indexed as
                  p[0], p[1], p[2], p[3], with p[1] and p[2] used as
                  coordinates.
        lmdks   - The landmarks.
        epsilon - The available privacy budget.
    Returns:
        rls_data - The perturbed data.
        bgts     - The privacy budget allocation.
        skipped  - The number of skipped releases.
    '''
    # Uniform budget allocation
    bgts = uniform(seq, lmdks, epsilon)
    # Released
    rls_data = [None]*len(seq)
    # The sampling rate
    samp_rt = 1
    # Track landmarks
    lmdk_cur = 0
    # Track skipped releases
    skipped = 0
    for i, p in enumerate(seq):
        # Check if current point is a landmark
        if lmdk_lib.is_landmark(p, lmdks):
            lmdk_cur += 1
        # Get coordinates
        loc = (p[1], p[2])
        # The first point is always released (should_sample is still called first)
        if lmdk_lib.should_sample(samp_rt) or i == 0:
            # Add noise to original data
            new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
            rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
            # Adjust sampling rate
            if i > 0:
                # Distance to the previous release in metres vs. 1/budget
                if distance((rls_data[i - 1][1], rls_data[i - 1][2]), new_loc).km*1000 < 1/bgts[i]:
                    # Decrease (alternative factors tried: .9, .75, .25, .1)
                    samp_rt -= samp_rt*.5
                else:
                    # Increase (alternative factors tried: .9, .75, .25, .1)
                    samp_rt += (1 - samp_rt)*.5
        else:
            skipped += 1
            # Skip current release and approximate with previous
            rls_data[i] = rls_data[i - 1]
            if lmdk_lib.is_landmark(p, lmdks):
                # Allocate the current budget to the following releases uniformly
                for j in range(i + 1, len(seq)):
                    bgts[j] += bgts[i]/(len(lmdks) - lmdk_cur + 1)
            # No budget was spent
            # NOTE(review): the source lost its indentation; this reset is
            # assumed to apply to every skipped release, not only to skipped
            # landmarks — confirm against the original file.
            bgts[i] = 0
    return rls_data, bgts, skipped
|
||||||
|
|
||||||
|
|
||||||
|
def skip(seq, lmdks, epsilon):
    '''
    Skip landmarks.

    Every point starts with the full budget epsilon (event-level). Regular
    points are released with noise; a landmark that is not the first point
    is replaced by the previously released location.

    Parameters:
        seq     - The point sequence; each point is indexed as
                  p[0], p[1], p[2], p[3], with p[1] and p[2] used as
                  coordinates.
        lmdks   - The landmarks.
        epsilon - The available privacy budget.
    Returns:
        rls_data - The perturbed data.
        bgts     - The privacy budget allocation.
    '''
    # Event-level budget allocation
    bgts = np.array(len(seq)*[epsilon])
    # Released
    rls_data = [None]*len(seq)
    for i, p in enumerate(seq):
        # Get coordinates
        loc = (p[1], p[2])
        # Add noise to original data (computed for landmarks too, then discarded)
        new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
        # Check if current point is a landmark
        if lmdk_lib.is_landmark(p, lmdks):
            if i > 0:
                # Approximate with previous location
                new_loc = (rls_data[i - 1][1], rls_data[i - 1][2])
                # NOTE(review): the source lost its indentation; this reset
                # is assumed to belong to the `i > 0` branch (a landmark at
                # index 0 is still released with noise) — confirm against
                # the original file.
                bgts[i] = 0
        rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
    return rls_data, bgts
|
||||||
|
|
||||||
|
|
||||||
|
def sample(seq, lmdks, epsilon):
    '''
    Publish randomly.

    Starts from the uniform allocation. The first point is always released;
    every other point is released according to should_sample() with a fixed
    rate of 0.5, otherwise the previous release is repeated.

    Parameters:
        seq     - The point sequence; each point is indexed as
                  p[0], p[1], p[2], p[3], with p[1] and p[2] used as
                  coordinates.
        lmdks   - The landmarks.
        epsilon - The available privacy budget.
    Returns:
        rls_data - The perturbed data.
        bgts     - The privacy budget allocation.
        skipped  - The number of skipped releases.
    '''
    # Uniform budget allocation
    bgts = uniform(seq, lmdks, epsilon)
    # Released
    rls_data = [None]*len(seq)
    # The sampling rate
    # (publish with 50% chance)
    samp_rt = .5
    # Track landmarks
    lmdk_cur = 0
    # Track skipped releases
    skipped = 0
    for i, p in enumerate(seq):
        # Check if current point is a landmark
        if lmdk_lib.is_landmark(p, lmdks):
            lmdk_cur += 1
        # Get coordinates
        loc = (p[1], p[2])
        # The first point short-circuits the sampling decision
        if i == 0 or lmdk_lib.should_sample(samp_rt):
            # Add noise to original data
            new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
            rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
        else:
            skipped += 1
            # Skip current release and approximate with previous
            rls_data[i] = rls_data[i - 1]
            if lmdk_lib.is_landmark(p, lmdks):
                # Allocate the current budget to the following releases uniformly
                for j in range(i + 1, len(seq)):
                    bgts[j] += bgts[i]/(len(lmdks) - lmdk_cur + 1)
            # No budget was spent
            # NOTE(review): the source lost its indentation; this reset is
            # assumed to apply to every skipped release, not only to skipped
            # landmarks — confirm against the original file.
            bgts[i] = 0
    return rls_data, bgts, skipped
|
||||||
|
|
||||||
|
|
||||||
|
def uniform_r(seq, lmdks, epsilon):
    '''
    Release every point with noise under the uniform budget allocation.

    Parameters:
        seq     - The point sequence; each point is indexed as
                  p[0], p[1], p[2], p[3], with p[1] and p[2] used as
                  coordinates.
        lmdks   - The landmarks.
        epsilon - The available privacy budget.
    Returns:
        rls_data - The perturbed data.
        bgts     - The privacy budget allocation.
    '''
    # Budgets
    bgts = uniform(seq, lmdks, epsilon)
    # Released
    rls_data = []
    for i, p in enumerate(seq):
        # Add noise to the original coordinates
        noisy = lmdk_lib.add_polar_noise((p[1], p[2]), bgts[i])
        rls_data.append([p[0], noisy[0], noisy[1], p[3]])
    return rls_data, bgts
|
||||||
|
|
||||||
|
|
||||||
|
def utility_analysis(seq, lmdks, o, epsilon):
    '''
    Analyze the utility.

    Prints the mean absolute error (in metres) of the perturbed data and of
    a freshly drawn uniform-allocation release, plus their relative
    difference.

    Parameters:
        seq     - The point sequence.
        lmdks   - The landmarks.
        o       - The perturbed data.
        epsilon - The available privacy budget.
    Returns:
        Nothing.
    '''
    n = len(seq)
    # Budgets of the uniform baseline
    bgts_u = uniform(seq, lmdks, epsilon)
    # Mean Absolute Error of the given release and of the uniform baseline
    err_cur = 0
    err_uni = 0
    for i, p in enumerate(seq):
        origin = (p[1], p[2])
        err_cur += distance(origin, (o[i][1], o[i][2])).km*1000/n
        baseline = lmdk_lib.add_polar_noise(origin, bgts_u[i])
        err_uni += distance(origin, (baseline[0], baseline[1])).km*1000/n

    print(
        '\n########## Analysis ##########\n'
        'MAE uniform : %.2fm\n'
        'MAE current : %.2fm\n'
        'Difference  : %.2f%%\n'
        '##############################\n'
        %(err_uni, err_cur, 100*(err_cur - err_uni)/err_uni)
    )
|
||||||
|
|
||||||
|
|
||||||
|
def mae(seq, o):
    '''
    Mean absolute error, in metres, between original and released points.

    Parameters:
        seq - The point sequence; p[1] and p[2] are used as coordinates.
        o   - The perturbed data, indexed like seq.
    Returns:
        The mean distance between each point and its release.
    '''
    n = len(seq)
    return sum(
        distance((p[1], p[2]), (o[i][1], o[i][2])).km*1000/n
        for i, p in enumerate(seq)
    )
|
967
code/lib/lmdk_lib.py
Normal file
967
code/lib/lmdk_lib.py
Normal file
@ -0,0 +1,967 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
|
from geopy.distance import distance
|
||||||
|
import heapq
|
||||||
|
import itertools
|
||||||
|
from scipy.special import lambertw
|
||||||
|
import math
|
||||||
|
import numpy as np
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import scipy.stats as stats
|
||||||
|
import time
|
||||||
|
from matplotlib import pyplot as plt
|
||||||
|
import zipfile
|
||||||
|
|
||||||
|
|
||||||
|
# Plot globals
|
||||||
|
dpi = 300
|
||||||
|
font_size = 24.0
|
||||||
|
line_width = 2.0
|
||||||
|
marker_size = 14.0
|
||||||
|
tick_length = 8.0
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def add_polar_noise(loc, epsilon):
    '''
    Add noise from planar Laplace.
    [https://github.com/chatziko/location-guard/blob/master/src/js/common/laplace.js]

    A bearing and a distance are drawn and the location is displaced
    accordingly. The draw is repeated until is_valid() accepts the result.

    Parameters:
        loc     - The original location
        epsilon - The privacy budget
    Returns:
        The perturbed location (lat, lng), as produced by geopy's
        destination().
    '''
    new_loc = None
    while not is_valid(new_loc):
        # Draw fresh noise on every attempt; reusing a single draw would
        # recompute the same rejected location forever.
        # The bearing in radians.
        # Random number in [0, 2*pi),
        theta = np.random.uniform(low = 0, high = np.pi*2)
        # The distance in meters.
        # Use the inverse cumulative polar laplacian distribution function.
        r = -np.real((lambertw((np.random.uniform(low = 0, high = 1) - 1)/np.e, k = -1) + 1)/epsilon)
        new_loc = distance(kilometers = r/1000).destination(point = loc, bearing = np.degrees(theta))
    return new_loc
|
||||||
|
|
||||||
|
|
||||||
|
def draw_line(line, x, label, marker):
    '''
    Plot the values of x against their 0-based positions.

    Parameters:
        line   - Not used by this function.
        x      - The values to plot.
        label  - The legend label.
        marker - The marker style.
    Returns:
        Nothing.
    '''
    positions = list(range(len(x)))
    plt.plot(positions, x, marker=marker, label=label)
|
||||||
|
|
||||||
|
|
||||||
|
def eval_seq(dists):
    '''
    Evaluate a list of distances by its standard deviation.

    Parameters:
        dists - A list of distances.
    Returns:
        The standard deviation of the distances.
    '''
    spread = np.std(dists)
    return spread
|
||||||
|
|
||||||
|
|
||||||
|
def get_abs_dists(seq, comb, lmdks):
    '''
    Get the distances of the points in a combination from the start and end of the sequence.

    The result interleaves, for every point of the sorted union of comb and
    lmdks, its distance from the first and from the last element of seq.

    Parameters:
        seq   - The point sequence.
        comb  - A possible point combination for a specified size.
        lmdks - The landmarks.
    Returns:
        dists - The distances of the points in a combination.
    '''
    cur = np.append(comb, lmdks)
    cur.sort()
    start = seq[0]
    end = seq[len(seq) - 1]
    dists = np.zeros([len(cur)*2])
    # Check if there are any points
    if cur.size:
        # Even slots: from the start; odd slots: from the end
        dists[0::2] = np.abs(cur - start)
        dists[1::2] = np.abs(cur - end)
    return dists
|
||||||
|
|
||||||
|
|
||||||
|
def get_combs(seq, size):
    '''
    Get all the possible landmark point combinations for a specified size.

    Parameters:
        seq  - The point sequence.
        size - The desired number of landmarks.
    Returns:
        An iterator over all the possible landmark combinations for the
        specified size.
    '''
    combs = itertools.combinations(seq, size)
    return combs
|
||||||
|
|
||||||
|
def get_combs_h(h, hist, max):
    '''
    Enumerate the histograms that dominate hist within given limits.

    Parameters:
        h    - The largest count allowed in a single bin.
        hist - The histogram giving the minimum count of every bin.
        max  - The largest allowed sum of all counts.
    Returns:
        A list of tuples, one count per bin, where every count lies between
        the corresponding hist value and h, and the counts sum to at most max.
    '''
    combs = []
    for comb in itertools.product(range(h + 1), repeat=len(hist)):
        total = sum(comb)
        rejected = any(
            total > max or c < hist[i] or c > h
            for i, c in enumerate(comb)
        )
        if not rejected:
            combs.append(comb)
    return combs
|
||||||
|
|
||||||
|
|
||||||
|
def get_emd(h1, h2):
    '''
    The earth mover's distance (EMD), or Wasserstein distance, of two histograms.
    [https://stats.stackexchange.com/a/151362]
    Pele et al., The quadratic-chi histogram distance family

    The histograms are passed to the distance function as given, i.e.,
    without normalization.

    Parameters:
        h1 - A histogram.
        h2 - Another histogram.
    Return:
        The EMD of the histograms.
    '''
    emd = stats.wasserstein_distance(h1, h2)
    return emd
|
||||||
|
|
||||||
|
|
||||||
|
def get_hist(seq, lmdks):
    '''
    Create a histogram from a set of landmarks.

    The bin width follows the Freedman-Diaconis rule (at least 1) and is
    then normalized so that the bins evenly cover 1..max(seq). Bin i holds
    the landmarks in (i*h, (i + 1)*h].

    Parameters:
        seq   - The point sequence.
        lmdks - The landmarks.
    Return:
        hist - The histogram of landmarks.
        h    - The bin size.
    '''
    # Dealing with zeros.
    # An empty sequence has no maximum, so return an empty histogram.
    if len(seq) == 0:
        return np.zeros(0), 1
    seq_max = max(seq)
    if len(lmdks) == 0:
        return np.zeros(math.ceil(seq_max)), 1

    # Interquartile range (IQR) is a measure of statistical dispersion, being equal to the difference between 75th and 25th percentiles, or between upper and lower quartiles.
    # https://en.wikipedia.org/wiki/Interquartile_range
    iqr = stats.iqr(lmdks)

    # Use the Freedman–Diaconis rule to select the width of the bins to be used in the histogram.
    # Robust (resilient to outliers) estimator that takes into account data variability and data size.
    # https://en.wikipedia.org/wiki/Freedman%E2%80%93Diaconis_rule
    h = 2*iqr*len(lmdks)**(-1/3)
    if h < 1:
        h = 1
    # On the number of bins and width:
    # https://en.wikipedia.org/wiki/Histogram#Number_of_bins_and_width

    # Normalize the interval
    h = math.ceil(seq_max/math.ceil(seq_max/math.ceil(h)))

    # Create an empty histogram with intervals of size h
    hist = np.zeros(math.ceil(seq_max/h))

    for lmdk in lmdks:
        # Round up so that landmarks smaller than h land in the first bin;
        # truncating would yield index -1 and wrap around to the last bin.
        idx = math.ceil(lmdk/h) - 1
        if idx < 0:
            idx = 0
        hist[idx] += 1

    return hist, h
|
||||||
|
|
||||||
|
|
||||||
|
def get_lmdks(seq, size, dist):
    '''
    Get a landmark set for a distribution type.

    Parameters:
        seq  - The point sequence.
        size - The landmarks' size.
        dist - The distribution type code.
               -1 - Left-skewed
                0 - Symmetric
               +1 - Right-skewed
               +2 - Bimodal
               +3 - Uniform
    Return:
        The sampled landmarks. For the skewed, symmetric, and bimodal codes
        this is whatever MixtureModel.sample() returns; for any other code
        it is a sorted array of distinct values drawn uniformly from
        min(seq)..max(seq).
    '''
    scale = len(seq)/10
    if dist == -1 or dist == 0 or dist == 1:
        # A single truncated normal centered according to the skewness
        skews = [dist]
    elif dist == 2:
        # Two truncated normals, one towards each end
        skews = [-1, +1]
    else:
        candidates = np.arange(min(seq), max(seq) + 1, 1)
        return np.sort(np.random.choice(candidates, size=size, replace=False))
    components = [
        truncnorm(min(seq), max(seq), get_loc(seq, skew), scale)
        for skew in skews
    ]
    return MixtureModel(components).sample(min(seq), max(seq), [], size)
|
||||||
|
|
||||||
|
|
||||||
|
def get_lmdks_rand(seq, size, skew):
    '''
    Get a landmark set with a given skewness.

    Combinations of seq are scanned in order and the first one whose
    skewness is close to the requested value (relative tolerance 0.1), or
    agrees with it on being negative or not, is returned.

    Parameters:
        seq  - The point sequence.
        size - The landmarks' size.
        skew - The desired skewness.
    Return:
        The selected combination of landmarks, or an empty tuple if none
        qualifies.
    '''
    for comb in get_combs(seq, size):
        skew_cur = get_skew(comb)
        # Check if skewness is close to the requirement or have the same sign
        if math.isclose(skew_cur, skew, rel_tol=.1) or ((skew_cur < 0) == (skew < 0)):
            return comb
    return ()
|
||||||
|
|
||||||
|
|
||||||
|
def get_loc(seq, skew):
    '''
    Get the location of the sequence with a given skewness.

    The location is the integer midpoint of an interval: the whole range of
    seq for zero skewness, the interval from half the range up to the
    maximum for negative skewness, and the interval from the minimum up to
    half the range for positive skewness.

    Parameters:
        seq  - The point sequence.
        skew - The desired skewness.
    Return:
        The location.
    '''
    lo, hi = min(seq), max(seq)
    # Half of the sequence's range, measured before either bound moves
    half = (hi - lo)/2
    if skew < 0:
        lo = half
    elif skew > 0:
        hi = half
    return int(lo + (hi - lo)/2)
|
||||||
|
|
||||||
|
|
||||||
|
def get_mean(seq, hist, h):
    '''
    Get the mean of the histogram.

    Every bin contributes its midpoint weighted by its count. Bins start at
    min(seq), are h wide, and are clipped at max(seq).

    Parameters:
        seq  - The point sequence.
        hist - The histogram of landmarks.
        h    - The bin size.
    Return:
        The mean of the histogram.
    '''
    def midpoint(idx):
        # Find bin limits
        start = min(seq) + h*idx
        end = min(seq) + h*(idx + 1)
        if end > max(seq):
            end = max(seq)
        return start + (end - start)/2

    weighted = sum(midpoint(idx)*count for idx, count in enumerate(hist))
    return weighted/np.sum(hist)
|
||||||
|
|
||||||
|
|
||||||
|
def get_norm(h1, h2):
    '''
    The Euclidean distance of two histograms.
    [https://stackoverflow.com/a/1401828/13123075]

    Parameters:
        h1 - A histogram.
        h2 - Another histogram.
    Return:
        The Euclidean distance of the histograms.
    '''
    diff = h1 - h2
    return np.linalg.norm(diff)
|
||||||
|
|
||||||
|
|
||||||
|
def get_norm_hist(hist):
    '''
    Normalize the histogram.

    Parameters:
        hist - The original histogram.
    Return:
        The normalized histogram, as a new float32 array whose non-zero
        entries are divided by the total count.
    '''
    # Work on a float32 copy of the input
    norm_hist = hist.astype(np.float32)
    n = np.sum(norm_hist)
    # Only non-zero bins are divided, so an all-zero histogram stays as is
    nonzero = norm_hist != 0
    norm_hist[nonzero] = norm_hist[nonzero]/n
    return norm_hist
|
||||||
|
|
||||||
|
|
||||||
|
def get_opts(seq, lmdks):
    '''
    Finds all the possible valid options.

    For every size k from len(lmdks) + 1 up to the last element of seq, all
    combinations of k - len(lmdks) regular points are generated. An option
    picks one combination per size; it is valid when each combination
    contains every element of the previous, smaller one.

    Parameters:
        seq   - The point sequence.
        lmdks - The landmarks.
    Returns:
        A list with all the possible options.

    Note:
        The candidates are the Cartesian product of the combinations of all
        sizes, so their number grows combinatorially with the number of
        regular points.
    '''
    reg = get_reg(seq, lmdks)
    n_lmdks = len(lmdks)
    # Find all possible combinations for every k
    combs = [
        get_combs(reg, k - n_lmdks)
        for k in range(n_lmdks + 1, seq[len(seq) - 1] + 1)
    ]

    rslts = []
    # Find all possible options for all combinations
    for opt in itertools.product(*combs):
        # An option without any combination is never kept
        if not opt:
            continue
        # Keep only valid options, i.e., larger sets must contain every element of every smaller set
        nested = all(
            set(opt[i - 1]).issubset(opt[i])
            for i in range(1, len(opt))
        )
        if nested:
            rslts.append(opt)

    return rslts
|
||||||
|
|
||||||
|
|
||||||
|
def get_reg(seq, lmdks):
    '''
    Get the regular points, i.e., non-landmarks, in a sequence.

    Parameters:
        seq   - The point sequence.
        lmdks - The landmarks.
    Returns:
        An array with the non-landmark points, in sequence order.
    '''
    reg = []
    for p in seq:
        if p in lmdks:
            continue
        reg.append(p)
    return np.array(reg)
|
||||||
|
|
||||||
|
|
||||||
|
def get_rel_dists(seq, comb, lmdks):
    '''
    Get the distances of the points in a combination from the nearest point of reference.
    That is, the gap between the start of the sequence and the first point,
    the gaps between consecutive points, and the gap between the last point
    and the end of the sequence.

    Parameters:
        seq   - The point sequence.
        comb  - A possible point combination for a specified size.
        lmdks - The landmarks.
    Returns:
        dists - The distances of the points in a combination.
    '''
    # TODO: Review distances. Maybe take into account both ways?
    cur = np.append(comb, lmdks)
    cur.sort()
    start = seq[0]
    end = seq[len(seq) - 1]
    n = len(cur)
    dists = np.zeros([n + 1])
    # Check if there are any points
    if cur.size:
        # First
        dists[0] = abs(cur[0] - start)
        # In-between
        dists[1:n] = np.abs(cur[:-1] - cur[1:])
        # Last
        dists[n] = abs(cur[n - 1] - end)
    return dists
|
||||||
|
|
||||||
|
|
||||||
|
def get_seq(start, end):
    '''
    Get a sequence of points.

    Parameters:
        start - The starting point of the sequence.
        end   - The ending point of the sequence (inclusive).
    Returns:
        A point sequence.
    '''
    # The upper bound of arange is exclusive
    stop = end + 1
    return np.arange(start, stop)
|
||||||
|
|
||||||
|
|
||||||
|
def get_shannon_ent(a):
    '''
    Get the Shannon entropy of an array.
    It can be used to calculate the uniformity of a histogram.
    Uniform probability yields maximum uncertainty and therefore maximum entropy.
    Entropy, then, can only decrease from the value associated with uniform probability.

    Parameters:
        a - An array of integers
    Return:
        The Shannon entropy.
    '''
    # Drop the zero counts because log2(0) is -inf.
    # The entropy of an event with zero probability is 0.
    counts = a[a != 0]
    # Histograms are discrete probability distributions.
    # Convert the counts to probabilities.
    probs = counts/np.sum(counts)
    # Base 2 gives the unit of bits (or 'shannons').
    weighted = probs*np.log2(probs)
    return -np.sum(weighted)
|
||||||
|
|
||||||
|
|
||||||
|
def get_shannon_ent_norm(a):
    '''
    Get the normalized Shannon entropy of an array.
    It ranges from 0 (high entropy - close to uniform) to 1 (low entropy).

    Parameters:
        a - An array of integers
    Return:
        The normalized Shannon entropy.
    '''
    ent = get_shannon_ent(a)
    # The divisor is the base-2 logarithm of the number of elements of a.
    # NOTE(review): it is 0 for a single-element array, so the ratio is
    # undefined in that case.
    ent_max = np.log2(len(a))
    return 1 - ent/ent_max
|
||||||
|
|
||||||
|
|
||||||
|
def get_skew(lmdks):
    '''
    Fisher-Pearson coefficient of skewness.
        negative - left (more data items right)
        zero - symmetric data
        positive - right (more data items left)

    Parameters:
        lmdks - The landmarks.
    Return:
        Fisher-Pearson coefficient of skewness.
    '''
    # The computation is delegated to stats.skew with its default arguments.
    skewness = stats.skew(lmdks)
    return skewness
|
||||||
|
|
||||||
|
|
||||||
|
def get_std(seq, hist, h):
    '''
    Get the standard deviation.

    Every bin is represented by its midpoint and weighted by its count.

    Parameters:
        seq - The point sequence.
        hist - The histogram of landmarks.
        h - The bin size.
    Return:
        The standard deviation.
    '''
    hist_mean = get_mean(seq, hist, h)
    # The bounds of the sequence are the same for every bin.
    seq_min, seq_max = min(seq), max(seq)

    sq_dev = 0
    for idx, count in enumerate(hist):
        # Find bin limits
        start = seq_min + h*idx
        end = seq_min + h*(idx + 1)
        # The last bin cannot extend past the end of the sequence.
        if end > seq_max:
            end = seq_max

        mid = start + (end - start)/2
        sq_dev += ((mid - hist_mean)**2)*count

    # The weighted mean of the squared deviations is the variance;
    # its square root is the standard deviation.
    return np.sqrt(sq_dev/np.sum(hist))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def get_third(seq, hist, h):
    '''
    Get the third central moment.

    Every bin is represented by its midpoint and weighted by its count.

    Parameters:
        seq - The point sequence.
        hist - The histogram of landmarks.
        h - The bin size.
    Return:
        The third central moment.
    '''
    hist_mean = get_mean(seq, hist, h)
    # The bounds of the sequence are the same for every bin.
    seq_min, seq_max = min(seq), max(seq)

    cub_dev = 0
    for idx, count in enumerate(hist):
        # Find bin limits
        start = seq_min + h*idx
        end = seq_min + h*(idx + 1)
        # The last bin cannot extend past the end of the sequence.
        if end > seq_max:
            end = seq_max

        mid = start + (end - start)/2
        cub_dev += ((mid - hist_mean)**3)*count

    return cub_dev/np.sum(hist)
|
||||||
|
|
||||||
|
|
||||||
|
def dist_type_to_str(d):
    '''
    Convert the distribution type code to string.

    Parameters:
        d - The distribution type code.
            -1 - Left-skewed
             0 - Symmetric
            +1 - Right-skewed
            +2 - Bimodal
            +3 - Uniform
    Return:
        The distribution type, or 'Undefined' for any other code.
    '''
    names = (
        (-1, 'Left-skewed'),
        (0, 'Symmetric'),
        (1, 'Right-skewed'),
        (2, 'Bimodal'),
        (3, 'Uniform'),
    )
    # Scan the table in order and compare with ==, one code at a time.
    for code, name in names:
        if d == code:
            return name
    return 'Undefined'
|
||||||
|
|
||||||
|
|
||||||
|
class MixtureModel(stats.rv_continuous):
    '''
    A mixture model of continuous probability distributions
    [https://stackoverflow.com/a/47763145/13123075]

    Every submodel contributes to the mixture with the same weight.
    '''
    def __init__(self, submodels, *args, **kwargs):
        '''
        Parameters:
            submodels - The distributions to mix; each must offer pdf and rvs.
            args, kwargs - Passed on to stats.rv_continuous.
        '''
        super().__init__(*args, **kwargs)
        self.submodels = submodels

    def _pdf(self, x):
        # The mixture density is the plain average of the submodels' densities.
        pdf = self.submodels[0].pdf(x)
        for submodel in self.submodels[1:]:
            pdf += submodel.pdf(x)
        pdf /= len(self.submodels)
        return pdf

    def rvs(self, size):
        '''
        Draw random variates from the mixture.

        Parameters:
            size - The number of variates.
        Returns:
            An array of variates, each taken from a randomly chosen submodel.
        '''
        # Pick a submodel per variate, draw size variates from every submodel,
        # and keep for each position the variate of the picked submodel.
        submodel_choices = np.random.randint(len(self.submodels), size=size)
        submodel_samples = [submodel.rvs(size=size) for submodel in self.submodels]
        rvs = np.choose(submodel_choices, submodel_samples)
        return rvs

    def sample(self, lower, upper, sample, size):
        '''
        Sample from the mixture model without replacement.
        [https://stackoverflow.com/a/20548895/13123075]

        Parameters:
            self - The mixture model.
            lower - The lower bound of the range.
            upper - The upper bound of the range.
            sample - Items to be excluded.
            size - The sample size.
        Returns:
            A sample, i.e., a sorted array of at most size integers from
            [lower, upper] that are not in the excluded items.
        '''
        lo, hi = int(lower), int(upper)
        points = range(lo, hi + 1)
        # Weight of every integer point of the range; w[0] belongs to lo.
        w = self.pdf(points)
        idx = [i for i in points if i not in sample]
        # Give each candidate a random key scaled by its weight and keep the
        # candidates with the largest keys. The weight of point i is at
        # offset i - lo, whatever the lower bound is.
        elt = [(math.log(random.random()) / w[i - lo], i) for i in idx]
        return np.sort([x[1] for x in heapq.nlargest(size, elt)])
|
||||||
|
|
||||||
|
|
||||||
|
def plot_init():
    '''
    Initialize the plot.

    Closes all the open figures and prepares a new one with a single subplot,
    using the module-level dpi, font_size, line_width, and tick_length.
    '''
    # Reset
    plt.close('all')
    # Style
    plt.style.use('classic')
    # DPI
    fig = plt.figure(dpi=dpi)
    # Font
    plt.rc('font', family='sans-serif')
    plt.rc('font', **{'sans-serif':['Liberation Sans']})
    plt.rc('font', size=font_size)
    # Grid
    # Add the subplot to the figure created above; calling plt.figure() again
    # would open a second figure and leave the one with the DPI empty.
    ax = fig.add_subplot(111)
    plt.setp(ax.spines.values(), linewidth=line_width)
    plt.grid(True, axis='y', linewidth=line_width)
    # Ticks
    ax.tick_params(which='both', width=line_width)
    ax.tick_params(which='major', length=tick_length)
    ax.tick_params(which='minor', length=tick_length/2)
    # Colors
    ax.set_prop_cycle('color', ['#212121', '#616161', '#9e9e9e', '#bdbdbd', '#e0e0e0', 'f5f5f5'])
    # Layout
    plt.tight_layout()
|
||||||
|
|
||||||
|
|
||||||
|
def plot_legend():
    '''
    Initialize the plot legend.

    The legend is drawn on the current plot with the module-level font_size.
    '''
    legend_opts = {
        'loc': 'best',
        'fontsize': font_size,
        'numpoints': 1,
        'borderpad': .2,  # default: 0.4
        'labelspacing': .25,  # default: 0.5
        'handlelength': 2.0,  # default: 2.0
        'handletextpad': .4,  # default: 0.8
        'borderaxespad': .5,  # default: 0.5
    }
    plt.legend(**legend_opts)
|
||||||
|
|
||||||
|
|
||||||
|
def save_plot(path):
    '''
    Save the plot to a file.

    The parent directory of the file is created if it does not exist.
    The current figure is cleared and closed afterwards.

    Parameters:
        path - The desired location.
    Returns:
        Nothing.
    '''
    # Save plot
    parent = os.path.dirname(path)
    # A bare file name has no directory part and os.makedirs('') fails.
    if parent:
        os.makedirs(parent, exist_ok=True)
    plt.savefig(path, bbox_inches='tight')
    # Clean up
    plt.clf()
    plt.close()
|
||||||
|
|
||||||
|
|
||||||
|
def print_lmdks(seq, lmdks):
    '''
    Print the landmarks.

    After the landmarks themselves, a one-line map of the sequence is printed,
    with a quote for every point that is a landmark and a dot for the rest.

    Parameters:
        seq - The point sequence.
        lmdks - The landmarks.
    Returns:
        Nothing.
    '''
    print('\n######### Landmarks ##########')
    print(lmdks)
    # One mark per point of the sequence.
    marks = ['\'' if np.any(lmdks[:] == p) else '.' for p in seq]
    print('[' + ''.join(marks) + ']', end='', flush=True)
    print('\n##############################\n')
|
||||||
|
|
||||||
|
|
||||||
|
def plot_dist(seq, dist):
    '''
    Plot the probability distribution.

    Parameters:
        seq - The point sequence.
        dist - The distribution type code.
            -1 - Left-skewed
             0 - Symmetric
            +1 - Right-skewed
            +2 - Bimodal
            +3 - Uniform
    Returns:
        Nothing.
    '''
    lower, upper = min(seq), max(seq)
    scale = len(seq)/10
    if dist in (-1, 0, 1):
        # A single truncated normal, located according to the type.
        p = MixtureModel([truncnorm(lower, upper, get_loc(seq, dist), scale)]).pdf(seq)
    elif dist == 2:
        # Two truncated normals, one at each side of the sequence.
        p = MixtureModel([
            truncnorm(lower, upper, get_loc(seq, -1), scale),
            truncnorm(lower, upper, get_loc(seq, +1), scale)
        ]).pdf(seq)
    else:
        # Uniform (and any other code): spread the density over the whole
        # sequence; without loc and scale the support would be [0, 1].
        p = stats.uniform.pdf(seq, loc=lower, scale=upper - lower)
    plt.plot(seq, p)
    plt.show()
|
||||||
|
|
||||||
|
|
||||||
|
def simplify_data(seq, lmdks):
    '''
    Get synthetic from real data.

    The points are replaced by their 1-based positions in the trajectory.

    Parameters:
        seq - The trajectory.
        lmdks - The landmarks.
    Returns:
        The simplified sequence and landmarks, i.e., the positions 1..len(seq)
        and an array with the positions of the points that are landmarks.
    '''
    seq_s = get_seq(1, len(seq))
    # Positions start from 1, like the simplified sequence.
    lmdks_s = [pos for pos, p in enumerate(seq, start=1) if is_landmark(p, lmdks)]
    return seq_s, np.array(lmdks_s)
|
||||||
|
|
||||||
|
|
||||||
|
def truncnorm(lower, upper, loc, scale):
    '''
    A truncated normal continuous random variable.

    Parameters:
        lower - The lower bound of the range.
        upper - The upper bound of the range.
        loc - The location of the distribution.
        scale - The spread of the distribution.
    Returns:
        A truncated normal continuous random variable.
    '''
    # stats.truncnorm expects the bounds relative to loc, in units of scale.
    a = (lower - loc)/scale
    b = (upper - loc)/scale
    return stats.truncnorm(a, b, loc=loc, scale=scale)
|
||||||
|
|
||||||
|
|
||||||
|
def is_valid(coord):
    '''
    Check if coordinates are valid.

    Parameters:
        coord - The coordinates to be checked, as (latitude, longitude).
    Returns:
        True - Valid coordinates.
        False - Invalid coordinates, i.e., None, a latitude outside
                [-90, 90], a longitude outside [-180, 180], or NaN.
    '''
    if coord is None:
        return False
    # Range checks written as "inside the interval" also reject NaN,
    # which compares false with everything.
    lat = float(coord[0])
    if not -90 <= lat <= 90:
        return False
    lng = float(coord[1])
    return -180 <= lng <= 180
|
||||||
|
|
||||||
|
|
||||||
|
def save_data(args, data, name):
    '''
    Save to file and compress.

    The data, without duplicate rows, are written to a file next to the
    archive args.res, the file is appended to the archive, and then deleted.
    Any error is printed and not raised.

    Parameters:
        args - The execution arguments.
        data - The data.
        name - The name.
    Returns:
        Nothing.
    '''
    try:
        out_dir = os.path.dirname(os.path.abspath(args.res))
        path = out_dir + '/' + name
        if 'lmdks' in name:
            # Landmark files carry the parameters that produced them.
            path += '(dis=' + str(args.dist) + ', per=' + str(args.per) + ')'
        print('Saving to %s... ' %(path), end='', flush=True)
        with open(path, 'wb') as file:
            # Keep the first occurrence of every row, in the original order.
            _, first_seen = np.unique(data, axis=0, return_index=True)
            np.save(file, data[np.sort(first_seen)], allow_pickle=False)
        print('[OK]')
        # Compress file
        print('Compressing in %s... ' %(args.res), end='', flush=True)
        with zipfile.ZipFile(args.res, 'a') as zip_file:
            zip_file.write(path, os.path.basename(path))
        print('[OK]')
        # Delete file
        print('Deleting %s... ' %(path), end='', flush=True)
        os.remove(path)
        print('[OK]')
    except Exception as e:
        print('[Error: %s]' %(e))
|
||||||
|
|
||||||
|
|
||||||
|
def load_data(args, name):
    '''
    Load data from file.

    The data are read from the member of the archive args.res that has the
    given name. Any error is printed and not raised.

    Parameters:
        args - The execution arguments.
        name - The name.
    Returns:
        data - The data set, or an empty array if loading failed.
            0: uid, 1: lat, 2: lng, 3: tim
    '''
    data = np.array([])
    try:
        path = os.path.dirname(os.path.abspath(args.res)) + '/' + name
        if 'lmdks' in name:
            # Landmark files carry the parameters that produced them.
            path += '(dis=' + str(args.dist) + ', per=' + str(args.per) + ')'
        print('Loading %s... ' %(path), end='', flush=True)
        with zipfile.ZipFile(args.res, 'r') as zip_file:
            # Close the archive member as soon as the array is read.
            with zip_file.open(os.path.basename(path)) as member:
                data = np.load(member)
        print('[OK]')
    except Exception as e:
        print('[Error: %s]' %(e))
    return data
|
||||||
|
|
||||||
|
|
||||||
|
def find_lmdks(usrs_data, args):
    '''
    Find users' landmarks.

    For every user, points are accumulated from a valid point i until a valid
    point j lies farther than args.dist from i or the trajectory ends. The
    accumulated points, j included, become a landmark if more than args.per
    passed between i and j. The search then continues from j.

    Parameters:
        args - The execution arguments.
            dist - The distance threshold in meters.
            per - The period threshold in minutes.
        usrs_data - The users' data.
            0: uid, 1: lat, 2: lng, 3: tim
    Returns:
        usrs_lmdks - The users' landmarks.
            0: lid, 1: uid, 2: lat, 3: lng, 4: tim
    '''
    usrs_lmdks = np.empty((0,5), np.float32)
    traj_cur = 0
    lmdk_id = 0
    usrs = np.unique(usrs_data[:,0])
    for usr in usrs:
        # Initialize user's landmarks list
        lmdks = []
        traj = usrs_data[usrs_data[:,0]==usr, :]
        traj_cur += len(traj)
        print(
            '[%d%%] Points: %d/%d | Users: %d/%d... '
            %((traj_cur/usrs_data.shape[0])*100, traj_cur, usrs_data.shape[0], usr, usrs[len(usrs) - 1]),
            end='', flush=True
        )
        # Check the actual points
        i = 0
        while i < len(traj) - 1:
            lmdk_cur = []
            # Becomes True when the scan below moves i to the closing point.
            advanced = False
            if is_valid((traj[i][1], traj[i][2])):
                for j in range(i + 1, len(traj)):
                    if is_valid((traj[j][1], traj[j][2])):
                        # Add the beginning only the first time
                        if j == i + 1:
                            lmdk_cur.append([lmdk_id, usr, traj[i][1], traj[i][2], traj[i][3]])
                        # Add the new point
                        lmdk_cur.append([lmdk_id, usr, traj[j][1], traj[j][2], traj[j][3]])
                        # Distance in meters
                        dist = distance((traj[i][1], traj[i][2]), (traj[j][1], traj[j][2])).km*1000
                        # Distance exceeded or reached end of iteration
                        if dist > args.dist or j == len(traj) - 1:
                            # Period in minutes, straight from the timestamps
                            # (seconds); going through local datetimes would
                            # be off across daylight saving time changes.
                            per = abs(int(traj[i][3]) - int(traj[j][3]))/60
                            # Check if enough time passed
                            if per > args.per:
                                # lmdk_id starts from 1
                                lmdk_id += 1
                                # Assign id to current landmark
                                for point in lmdk_cur:
                                    point[0] = lmdk_id
                                # Append current landmark
                                lmdks += lmdk_cur
                            # Continue checking from the current point
                            i = j
                            advanced = True
                            break
            # No landmark was found, continue from next point
            if not advanced:
                i += 1
        print('[OK]')
        if lmdks:
            usrs_lmdks = np.append(usrs_lmdks, np.asarray(lmdks, dtype=np.float32), axis=0)
    return usrs_lmdks
|
||||||
|
|
||||||
|
|
||||||
|
def lmdks_stats(args, usrs_lmdks):
    '''
    Generate landmarks' stats.

    Prints the number of landmarks, the total number of their points (also as
    a percentage of args.time), and their minimum and maximum length.

    Parameters:
        args - The execution arguments.
        usrs_lmdks - The user' landmarks.
            0: lid, 1: uid, 2: lat, 3: lng, 4: tim
    Returns:
        Nothing.
    '''
    total = 0
    length = 0
    # Landmark length -> number of landmarks with that length.
    count = {}
    # Check every user
    usrs = np.unique(usrs_lmdks[:, 1])
    for usr_i, usr in enumerate(usrs, start=1):
        # The progress follows the position of the user, not the user's id,
        # so it stays within 100% whatever the ids are.
        print(
            '[%d%%] Calculating landmark stats for user %d/%d... '
            %(usr_i*100/len(usrs), usr, np.max(usrs)),
            end='', flush=True
        )
        # Check each user's landmarks
        for lmdk in np.unique(usrs_lmdks[usrs_lmdks[:, 1] == usr, 0]):
            lmdk_len = len(usrs_lmdks[usrs_lmdks[:, 0] == lmdk, :])
            total += 1
            length += lmdk_len
            count[lmdk_len] = count.get(lmdk_len, 0) + 1
        print('[OK]')
    len_min, len_max = min(count, key=int, default=0), max(count, key=int, default=0)
    # Show stats
    print(
        '\n############ Stats ###########\n'
        'Landmarks : %d\n'
        ' Length\n'
        ' Total : %d (%.2f%%)\n'
        ' Minimum : %d\n'
        ' Maximum : %d\n'
        '##############################\n'
        %(total, length, (length/args.time)*100, len_min, len_max)
    )
|
||||||
|
|
||||||
|
|
||||||
|
def should_sample(samp_rt):
    '''
    Randomly decide to release with noise.

    Parameters:
        samp_rt - The sampling rate (0, 1]
    Returns:
        True/False, where True is drawn with weight samp_rt and False with
        weight 1 - samp_rt.
    '''
    outcomes = [True, False]
    odds = [samp_rt, 1 - samp_rt]
    # A single weighted draw.
    drawn = random.choices(population=outcomes, weights=odds, k=1)
    return drawn[0]
|
||||||
|
|
||||||
|
|
||||||
|
def is_landmark(p, lmdks):
    '''
    Check if a point is a landmark.

    Parameters:
        p - The point to check.
        lmdks - A 2d array of landmarks.
    Returns:
        True/False
    '''
    # Nothing can match when there are no landmarks.
    if not len(lmdks):
        return False
    # Columns 1 to 4 of every landmark row are compared with the point;
    # a row matches only if all of them are equal.
    matches = np.equal(lmdks[:, 1:5], p).all(1)
    return bool(any(matches))
|
Loading…
Reference in New Issue
Block a user