2021-07-25 15:51:51 +02:00
#!/usr/bin/env python3
2021-09-29 00:12:57 +02:00
import ast
2021-07-25 15:51:51 +02:00
from datetime import datetime
from geopy . distance import distance
import heapq
import itertools
from scipy . special import lambertw
import math
import numpy as np
import os
import random
import scipy . stats as stats
import time
from matplotlib import pyplot as plt
import zipfile
# Plot globals
dpi = 300
font_size = 24.0
line_width = 2.0
marker_size = 14.0
tick_length = 8.0
def add_polar_noise ( loc , epsilon ) :
'''
Add noise from planar Laplace .
[ https : / / github . com / chatziko / location - guard / blob / master / src / js / common / laplace . js ]
Parameters :
2021-09-29 00:54:08 +02:00
loc - The original location .
epsilon - The privacy budget .
2021-07-25 15:51:51 +02:00
Returns :
2021-09-29 00:54:08 +02:00
The perturbed location ( lat , lng ) .
2021-07-25 15:51:51 +02:00
'''
# The bearing in radians.
# Random number in [0, 2*pi),
theta = np . random . uniform ( low = 0 , high = np . pi * 2 )
# The distance in meters.
# Use the inverse cumulative polar laplacian distribution function.
r = - np . real ( ( lambertw ( ( np . random . uniform ( low = 0 , high = 1 ) - 1 ) / np . e , k = - 1 ) + 1 ) / epsilon )
new_loc = None
while not is_valid ( new_loc ) :
new_loc = distance ( kilometers = r / 1000 ) . destination ( point = loc , bearing = np . degrees ( theta ) )
return new_loc
2021-09-29 01:03:50 +02:00
def randomized_response ( r , epsilon ) :
'''
Return the original response with probability p ,
or flip the response with probability 1 - p .
[ https : / / www . usenix . org / system / files / conference / usenixsecurity17 / sec17 - wang - tianhao . pdf ]
Parameters :
r - The original response ( True / False ) .
epsilon - The privacy budget .
Returns :
The randomized response .
'''
# Return the original value
if random . uniform ( 0 , 1 ) < = math . exp ( epsilon ) / ( math . exp ( epsilon ) + 1 ) :
return r
# Flip the original value
else :
return not r
2021-09-29 19:56:38 +02:00
def add_laplace_noise ( value , sens , epsilon ) :
'''
Add random noise to a data value .
Parameteres :
value - The value to add noise to .
sens - The query function sensitivity .
epsilon - The privacy budget .
Returns :
The value with the noise .
'''
return np . random . laplace ( value , sens / epsilon )
2021-07-25 15:51:51 +02:00
def draw_line ( line , x , label , marker ) :
axis_x = list ( range ( len ( x ) ) )
# plt.xticks(axis_x, x)
plt . plot ( axis_x , x , label = label , marker = marker )
def eval_seq ( dists ) :
'''
Calculate the standard deviation of a list of distances .
Parameters :
dists - A list of distances .
Returns :
The standard deviation of the distances .
'''
return np . std ( dists )
def get_abs_dists ( seq , comb , lmdks ) :
'''
Get the distances of the points in a combination from the start and end of the sequence .
Parameters :
seq - The point sequence .
comb - A possible point combination for a specified size .
lmdks - The landmarks .
Returns :
dists - The distances of the points in a combination .
'''
cur = np . append ( comb , lmdks )
cur . sort ( )
start = seq [ 0 ]
end = seq [ len ( seq ) - 1 ]
dists = np . zeros ( [ len ( cur ) * 2 ] )
# Check if there are any points
if cur . size :
for i in range ( 0 , len ( cur ) ) :
# From the start
dists [ i * 2 ] = abs ( cur [ i ] - start )
# From the end
dists [ i * 2 + 1 ] = abs ( cur [ i ] - end )
return dists
def get_combs ( seq , size ) :
'''
Get all the possible landmark point combinations for a specified size .
Parameters :
seq - The point sequence .
size - The desired number of landmarks .
Returns :
All the possible landmark combinations for a specified size .
'''
return itertools . combinations ( seq , size )
def get_combs_h ( h , hist , max ) :
combs = [ ]
for comb in itertools . product ( range ( h + 1 ) , repeat = len ( hist ) ) :
check = True
for i , c in enumerate ( comb ) :
if sum ( comb ) > max or c < hist [ i ] or c > h :
check = False
break
if check :
combs . append ( comb )
return combs
def get_emd ( h1 , h2 ) :
'''
The earth mover ' s distance (EMD), or Wasserstein distance, of two histograms.
[ https : / / stats . stackexchange . com / a / 151362 ]
Pele et al . , The quadratic - chi histogram distance family
Parameters :
h1 - A histogram .
h2 - Another histogram .
Return :
The EMD of the histograms .
'''
return stats . wasserstein_distance ( h1 , h2 )
# return stats.wasserstein_distance(get_norm_hist(h1), get_norm_hist(h2))
def get_hist ( seq , lmdks ) :
'''
Create a histogram from a set of landmarks .
Parameters :
seq - The point sequence .
lmdks - The landmarks .
Return :
hist - The histogram of landmarks .
h - The bin size .
'''
# # Create a landmark sequence relative to the entire sequence
# lmdks_rel = np.array(lmdks)
# # Add the start of the sequence if not in landmarks
# if not np.any(lmdks_rel[:] == start):
# lmdks_rel = np.insert(lmdks_rel, 0, start)
# # Add the end of the sequence if not in landmarks
# if not np.any(lmdks_rel[:] == end):
# lmdks_rel = np.append(lmdks_rel, end)
# Dealing with zeros.
if len ( seq ) == 0 or len ( lmdks ) == 0 :
return np . zeros ( math . ceil ( max ( seq ) ) ) , 1
# Interquartile range (IQR) is a measure of statistical dispersion, being equal to the difference between 75th and 25th percentiles, or between upper and lower quartiles.
# https://en.wikipedia.org/wiki/Interquartile_range
iqr = stats . iqr ( lmdks )
# Use the Freedman– Diaconis rule to select the width of the bins to be used in the histogram.
# Robust (resilient to outliers) estimator that takes into account data variability and data size.
# https://en.wikipedia.org/wiki/Freedman%E2%80%93Diaconis_rule
h = 2 * iqr * len ( lmdks ) * * ( - 1 / 3 )
if h < 1 :
h = 1
# On the number of bins and width:
# https://en.wikipedia.org/wiki/Histogram#Number_of_bins_and_width
# Normalize the interval
h = math . ceil ( max ( seq ) / math . ceil ( max ( seq ) / math . ceil ( h ) ) )
# Create an empty histogram with intervals of size h
2021-10-06 10:18:14 +02:00
hist = np . zeros ( math . ceil ( max ( seq ) / h ) , dtype = int )
2021-07-25 15:51:51 +02:00
for lmdk in lmdks :
hist [ int ( lmdk / h ) - 1 ] + = 1
return hist , h
def get_lmdks ( seq , size , dist ) :
'''
Get a landmark set for a distribution type .
Parameters :
seq - The point sequence .
size - The landmarks ' size.
dist - The distribution type code .
- 1 - Left - skewed
0 - Symmetric
+ 1 - Right - skewed
+ 2 - Bimodal
+ 3 - Uniform
Return :
hist - The histogram of landmarks .
h - The bin size .
'''
scale = len ( seq ) / 10
if dist == - 1 or dist == 0 or dist == 1 :
return MixtureModel ( [ truncnorm ( min ( seq ) , max ( seq ) , get_loc ( seq , dist ) , scale ) ] ) . sample ( min ( seq ) , max ( seq ) , [ ] , size )
elif dist == 2 :
return MixtureModel ( [
truncnorm ( min ( seq ) , max ( seq ) , get_loc ( seq , - 1 ) , scale ) ,
truncnorm ( min ( seq ) , max ( seq ) , get_loc ( seq , + 1 ) , scale )
] ) . sample ( min ( seq ) , max ( seq ) , [ ] , size )
else :
return np . sort ( np . random . choice ( np . arange ( min ( seq ) , max ( seq ) + 1 , 1 ) , size = size , replace = False ) )
def get_lmdks_rand ( seq , size , skew ) :
'''
Get a landmark set with a given skewness .
Parameters :
seq - The point sequence .
size - The landmarks ' size.
skew - The desired skewness .
Return :
hist - The histogram of landmarks .
h - The bin size .
'''
# Get all possible landmark combinations
lmdk_combs = get_combs ( seq , size )
# Find a set of landmarks for the given requirements
lmdks = ( )
for comb in lmdk_combs :
skew_cur = get_skew ( comb )
# Check if skewness is close to the requirement or have the same sign
if math . isclose ( skew_cur , skew , rel_tol = .1 ) or ( ( skew_cur < 0 ) == ( skew < 0 ) ) :
lmdks = comb
break
return lmdks
def get_loc ( seq , skew ) :
'''
Get the location of the sequence with a given skewness .
Parameters :
seq - The point sequence .
skew - The desired skewness .
Return :
The location .
'''
loc_min = min ( seq )
loc_max = max ( seq )
if skew < 0 :
loc_min = ( max ( seq ) - min ( seq ) ) / 2
elif skew > 0 :
loc_max = ( max ( seq ) - min ( seq ) ) / 2
return int ( loc_min + ( loc_max - loc_min ) / 2 )
def get_mean ( seq , hist , h ) :
'''
Get the mean of the histogram .
Parameters :
seq - The point sequence .
hist - The histogram of landmarks .
h - The bin size .
Return :
The mean of the histogram .
'''
sum = 0
for idx , count in enumerate ( hist ) :
# Find bin limits
start = min ( seq ) + h * idx
end = min ( seq ) + h * ( idx + 1 )
if ( end > max ( seq ) ) :
end = max ( seq )
sum + = ( start + ( end - start ) / 2 ) * count
return sum / np . sum ( hist )
def get_norm ( h1 , h2 ) :
'''
The Euclidean distance of two histograms .
[ https : / / stackoverflow . com / a / 1401828 / 13123075 ]
Parameters :
h1 - A histogram .
h2 - Another histogram .
Return :
The Euclidean distance of the histograms .
'''
return np . linalg . norm ( h1 - h2 )
def get_norm_hist ( hist ) :
'''
Normalize the histogram .
Parameters :
hist - The original histogram .
Return :
The normalized histogram .
'''
# In-place type conversion
norm_hist = hist . astype ( np . float32 )
n = np . sum ( norm_hist )
for i , a in enumerate ( norm_hist ) :
if a :
norm_hist [ i ] = a / n
return norm_hist
def get_opts ( seq , lmdks ) :
'''
Finds all the possible valid options .
Parameters :
seq - The point sequence .
lmdks - The landmarks .
Returns :
A list with all the possible options .
Requirements :
n = Regular points
r = The size of a combination
Time - O ( C ( n , r ) + 2 ^ C ( n , r ) ) , because O ( n * C ( n , r ) ) = O ( n * n ^ min ( r , n - r ) )
Space - O ( r * C ( n , r ) )
'''
reg = get_reg ( seq , lmdks )
# Find all possible combinations for every k
combs = [ ]
for k in range ( len ( lmdks ) + 1 , seq [ len ( seq ) - 1 ] + 1 ) :
combs . append ( get_combs ( reg , k - len ( lmdks ) ) )
# Find all possible options for all combinations
opts = itertools . product ( * combs )
# Keep only valid options, i.e., larger sets must contain every element of every smaller set
rslts = [ ]
for opt in opts :
for i , _ in enumerate ( opt ) :
if i and not set ( opt [ i - 1 ] ) . issubset ( opt [ i ] ) :
break
if i == len ( opt ) - 1 :
rslts . append ( opt )
return rslts
def get_reg ( seq , lmdks ) :
'''
Get the regular points , i . e . , non - landmarks , in a sequence .
Parameters :
seq - The point sequence .
lmdks - The landmarks .
Returns :
The non - landmark points in a sequence .
'''
return np . array ( [ i for i in seq if i not in lmdks ] )
def get_rel_dists ( seq , comb , lmdks ) :
'''
Get the distances of the points in a combination from the nearest point of reference .
That is , the previous / next point or the start / end if closer .
Parameters :
seq - The point sequence .
comb - A possible point combination for a specified size .
lmdks - The landmarks .
Returns :
dists - The distances of the points in a combination .
'''
# TODO: Review distances. Maybe take into account both ways?
cur = np . append ( comb , lmdks )
cur . sort ( )
start = seq [ 0 ]
end = seq [ len ( seq ) - 1 ]
dists = np . zeros ( [ len ( cur ) + 1 ] )
# Check if there are any points
if cur . size :
# First
dists [ 0 ] = abs ( cur [ 0 ] - start )
# In-between
for i in range ( 0 , len ( cur ) - 1 ) :
dists [ i + 1 ] = abs ( cur [ i ] - cur [ i + 1 ] )
# Last
dists [ len ( cur ) ] = abs ( cur [ len ( cur ) - 1 ] - end )
return dists
def get_seq ( start , end ) :
'''
Get a sequence of points .
Parameters :
start - The starting point of the sequence .
end - The ending point of the sequence .
Returns :
A point sequence .
'''
return np . arange ( start , end + 1 )
def get_shannon_ent ( a ) :
'''
Get the Shannon entropy of an array .
It can be used to calculate the uniformity of a histogram .
Uniform probability yields maximum uncertainty and therefore maximum entropy .
Entropy , then , can only decrease from the value associated with uniform probability .
Parameters :
a - An array of integers
Return :
The Shannon entropy .
'''
# Keep only the non-zero elements of array a because log2(0) is -inf.
# The entropy of an event with zero probability is 0.
a = a [ a != 0 ]
# Histograms are discrete probability distributions.
# We convert the counts to probabilities.
p_a = a / np . sum ( a )
# Base 2 gives the unit of bits (or 'shannons').
return - np . sum ( p_a * np . log2 ( p_a ) )
def get_shannon_ent_norm ( a ) :
'''
Get the normalized Shannon entropy of an array .
It ranges from 0 ( high entropy - close to uniform ) to 1 ( low entropy ) .
It ranges from 0 ( high entropy - close to uniform ) to 1 ( low entropy ) .
It ranges from 0 ( high entropy - close to uniform ) to 1 ( low entropy ) .
Parameters :
a - An array of integers
Return :
The normalized Shannon entropy .
'''
return 1 - get_shannon_ent ( a ) / np . log2 ( len ( a ) )
def get_skew ( lmdks ) :
'''
Fisher - Pearson coefficient of skewness .
negative - left ( more data items right )
zero - symmetric data
positive - right ( more data items left )
positive - right ( more data items left )
positive - right ( more data items left )
Parameters :
seq - The point sequence .
hist - The histogram of landmarks .
h - The bin size .
Return :
Fisher - Pearson coefficient of skewness .
'''
# def get_skew(seq, hist, h):
# return get_third(seq, hist, h)/get_std(seq, hist, h)**3
return stats . skew ( lmdks )
def get_std ( seq , hist , h ) :
'''
Get the standard deviation .
Parameters :
seq - The point sequence .
hist - The histogram of landmarks .
h - The bin size .
Return :
The standard deviation .
'''
hist_mean = get_mean ( seq , hist , h )
sum = 0
for idx , count in enumerate ( hist ) :
# Find bin limits
start = min ( seq ) + h * idx
end = min ( seq ) + h * ( idx + 1 )
if ( end > max ( seq ) ) :
end = max ( seq )
sum + = ( ( ( start + ( end - start ) / 2 ) - hist_mean ) * * 2 ) * count
return sum / np . sum ( hist )
def get_third ( seq , hist , h ) :
'''
Get the third central moment .
Parameters :
seq - The point sequence .
hist - The histogram of landmarks .
h - The bin size .
Return :
The third central moment .
'''
hist_mean = get_mean ( seq , hist , h )
sum = 0
for idx , count in enumerate ( hist ) :
# Find bin limits
start = min ( seq ) + h * idx
end = min ( seq ) + h * ( idx + 1 )
if ( end > max ( seq ) ) :
end = max ( seq )
sum + = ( ( ( start + ( end - start ) / 2 ) - hist_mean ) * * 3 ) * count
return sum / np . sum ( hist )
def dist_type_to_str ( d ) :
'''
Convert the distribution type code to string .
Parameters :
t - The distribution type code .
- 1 - Left - skewed
0 - Symmetric
+ 1 - Right - skewed
+ 2 - Bimodal
+ 3 - Uniform
Return :
The distribution type .
'''
if d == - 1 :
return ' Left-skewed '
elif d == 0 :
return ' Symmetric '
elif d == 1 :
return ' Right-skewed '
elif d == 2 :
return ' Bimodal '
elif d == 3 :
return ' Uniform '
else :
return ' Undefined '
class MixtureModel ( stats . rv_continuous ) :
'''
A mixture model of continuous probability distributions
[ https : / / stackoverflow . com / a / 47763145 / 13123075 ]
'''
def __init__ ( self , submodels , * args , * * kwargs ) :
super ( ) . __init__ ( * args , * * kwargs )
self . submodels = submodels
def _pdf ( self , x ) :
pdf = self . submodels [ 0 ] . pdf ( x )
for submodel in self . submodels [ 1 : ] :
pdf + = submodel . pdf ( x )
pdf / = len ( self . submodels )
return pdf
def rvs ( self , size ) :
submodel_choices = np . random . randint ( len ( self . submodels ) , size = size )
submodel_samples = [ submodel . rvs ( size = size ) for submodel in self . submodels ]
rvs = np . choose ( submodel_choices , submodel_samples )
return rvs
def sample ( self , lower , upper , sample , size ) :
'''
Sample from the mixture model without replacement .
[ https : / / stackoverflow . com / a / 20548895 / 13123075 ]
Parameters :
self - The mixture model .
lower - The lower bound of the range .
upper - The upper bound of the range .
sample - Items to be excluded .
size - The sample size .
Returns :
A sample .
'''
w = self . pdf ( range ( int ( lower ) , int ( upper ) + 1 ) )
idx = [ ]
for i in range ( int ( lower ) , int ( upper ) + 1 ) :
if i not in sample :
idx . append ( i )
elt = [ ( math . log ( random . random ( ) ) / w [ i - 1 ] , i ) for i in idx ]
return np . sort ( [ x [ 1 ] for x in heapq . nlargest ( size , elt ) ] )
def plot_init ( ) :
'''
Initialize the plot .
'''
# Reset
plt . close ( ' all ' )
# Style
plt . style . use ( ' classic ' )
# DPI
plt . figure ( dpi = dpi )
# Font
plt . rc ( ' font ' , family = ' sans-serif ' )
plt . rc ( ' font ' , * * { ' sans-serif ' : [ ' Liberation Sans ' ] } )
plt . rc ( ' font ' , size = font_size )
# Grid
plt . setp ( plt . figure ( ) . add_subplot ( 111 ) . spines . values ( ) , linewidth = line_width )
plt . grid ( True , axis = ' y ' , linewidth = line_width )
# Ticks
plt . gca ( ) . tick_params ( which = ' both ' , width = line_width )
plt . gca ( ) . tick_params ( which = ' major ' , length = tick_length )
plt . gca ( ) . tick_params ( which = ' minor ' , length = tick_length / 2 )
# Colors
plt . subplot ( 111 ) . set_prop_cycle ( ' color ' , [ ' #212121 ' , ' #616161 ' , ' #9e9e9e ' , ' #bdbdbd ' , ' #e0e0e0 ' , ' f5f5f5 ' ] )
# Layout
plt . tight_layout ( )
def plot_legend ( ) :
'''
Initialize the plot legend .
'''
plt . legend (
loc = ' best ' ,
fontsize = font_size ,
numpoints = 1 ,
borderpad = .2 , # default: 0.4
labelspacing = .25 , # default: 0.5
handlelength = 2.0 , # default: 2.0
handletextpad = .4 , # default: 0.8
borderaxespad = .5 , # default: 0.5
)
def save_plot ( path ) :
'''
Save the plot to a file .
Parameters :
path - The desired location .
Returns :
Nothing .
'''
# Save plot
os . makedirs ( os . path . dirname ( path ) , exist_ok = True )
plt . savefig ( path , bbox_inches = ' tight ' )
# Clean up
plt . clf ( )
plt . close ( )
def print_lmdks ( seq , lmdks ) :
'''
Print the landmarks .
Parameters :
seq - The point sequence .
lmdks - The landmarks .
Returns :
Nothing .
'''
print ( ' \n ######### Landmarks ########## ' )
print ( lmdks )
print ( ' [ ' , end = ' ' , flush = True )
for p in seq :
if np . any ( lmdks [ : ] == p ) :
print ( ' \' ' , end = ' ' , flush = True )
else :
print ( ' . ' , end = ' ' , flush = True )
print ( ' ] ' , end = ' ' , flush = True )
print ( ' \n ############################## \n ' )
def plot_dist ( seq , dist ) :
'''
Plot the probability distribution .
Parameters :
seq - The point sequence .
size - The landmarks ' size.
dist - The distribution type code .
- 1 - Left - skewed
0 - Symmetric
+ 1 - Right - skewed
+ 2 - Bimodal
+ 3 - Uniform
Returns :
Nothing .
'''
scale = len ( seq ) / 10
p = stats . uniform . pdf ( seq )
if dist == - 1 or dist == 0 or dist == 1 :
p = MixtureModel ( [ truncnorm ( min ( seq ) , max ( seq ) , get_loc ( seq , dist ) , scale ) ] ) . pdf ( seq )
elif dist == 2 :
p = MixtureModel ( [
truncnorm ( min ( seq ) , max ( seq ) , get_loc ( seq , - 1 ) , scale ) ,
truncnorm ( min ( seq ) , max ( seq ) , get_loc ( seq , + 1 ) , scale )
] ) . pdf ( seq )
plt . plot ( seq , p )
plt . show ( )
def simplify_data ( seq , lmdks ) :
'''
Get synthetic from real data .
Parameters :
seq - The trajectory .
lmdks - The landmarks .
Returns :
The simplified sequence and landmarks .
'''
seq_s = get_seq ( 1 , len ( seq ) )
lmdks_s = [ ]
for i , p in enumerate ( seq ) :
if is_landmark ( p , lmdks ) :
lmdks_s . append ( i + 1 )
return seq_s , np . array ( lmdks_s )
def truncnorm ( lower , upper , loc , scale ) :
'''
A truncated normal continuous random variable .
Parameters :
lower - The lower bound of the range .
upper - The upper bound of the range .
loc - The location of the distribution .
scale - The spread of the distribution .
Returns :
A truncated normal continuous random variable .
'''
return stats . truncnorm ( ( lower - loc ) / scale , ( upper - loc ) / scale , loc , scale )
def is_valid ( coord ) :
'''
Check if coordinates are valid .
Parameters :
coord - The coordinates to be checked .
Returns :
True - Valid coordinates .
False - Invalid coordinates .
'''
if coord is None or ( float ( coord [ 0 ] ) > 90 or float ( coord [ 0 ] ) < - 90 # lat
or float ( coord [ 1 ] ) > 180 or float ( coord [ 1 ] ) < - 180 ) : #lng
return False
return True
def save_data ( args , data , name ) :
'''
Save to file and compress .
Parameters :
args - The execution arguments .
data - The data .
name - The name .
Returns :
Nothing .
'''
try :
path = os . path . dirname ( os . path . abspath ( args . res ) ) + ' / ' + name
if ' lmdks ' in name :
path + = ' (dis= ' + str ( args . dist ) + ' , per= ' + str ( args . per ) + ' ) '
print ( ' Saving to %s ... ' % ( path ) , end = ' ' , flush = True )
with open ( path , ' wb ' ) as file :
_ , idx = np . unique ( data , axis = 0 , return_index = True )
np . save ( file , data [ np . sort ( idx ) ] , allow_pickle = False )
print ( ' [OK] ' )
# Compress file
print ( ' Compressing in %s ... ' % ( args . res ) , end = ' ' , flush = True )
with zipfile . ZipFile ( args . res , ' a ' ) as zip_file :
zip_file . write ( path , os . path . basename ( path ) )
print ( ' [OK] ' )
# Delete file
print ( ' Deleting %s ... ' % ( path ) , end = ' ' , flush = True )
os . remove ( path )
print ( ' [OK] ' )
except Exception as e :
print ( ' [Error: %s ] ' % ( e ) )
def load_data ( args , name ) :
'''
Load data from file .
Parameters :
args - The execution arguments .
data - The data .
name - The name .
Returns :
data - The data set .
0 : uid , 1 : lat , 2 : lng , 3 : tim
'''
data = np . array ( [ ] )
try :
path = os . path . dirname ( os . path . abspath ( args . res ) ) + ' / ' + name
if ' lmdks ' in name :
path + = ' (dis= ' + str ( args . dist ) + ' , per= ' + str ( args . per ) + ' ) '
print ( ' Loading %s ... ' % ( path ) , end = ' ' , flush = True )
with zipfile . ZipFile ( args . res , ' r ' ) as zip_file :
data = np . load ( zip_file . open ( os . path . basename ( path ) ) )
print ( ' [OK] ' )
except Exception as e :
print ( ' [Error: %s ] ' % ( e ) )
return data
def find_lmdks ( usrs_data , args ) :
'''
Find users ' landmarks.
Parameters :
args - The execution arguments .
usrs_data - The users ' data.
0 : uid , 1 : lat , 2 : lng , 3 : tim
Returns :
usrs_lmdks - The users ' landmarks.
2021-10-06 10:26:28 +02:00
0 : uid , 1 : lat , 2 : lng , 3 : tim
2021-07-25 15:51:51 +02:00
'''
2021-10-06 10:26:28 +02:00
usrs_lmdks = np . empty ( ( 0 , 4 ) , np . float32 )
2021-07-25 15:51:51 +02:00
traj_cur = 0
lmdk_id = 0
usrs = np . unique ( usrs_data [ : , 0 ] )
for usr_i , usr in enumerate ( usrs ) :
# Initialize user's landmarks list
lmdks = [ ]
traj = usrs_data [ usrs_data [ : , 0 ] == usr , : ]
traj_cur + = len ( traj )
print (
' [ %d %% ] Points: %d / %d | Users: %d / %d ... '
% ( ( traj_cur / usrs_data . shape [ 0 ] ) * 100 , traj_cur , usrs_data . shape [ 0 ] , usr , usrs [ len ( usrs ) - 1 ] ) ,
end = ' ' , flush = True
)
# Check the actual points
i = 0
while i < len ( traj ) - 1 :
lmdk_cur = [ ]
if is_valid ( ( traj [ i ] [ 1 ] , traj [ i ] [ 2 ] ) ) :
for j in range ( i + 1 , len ( traj ) ) :
if is_valid ( ( traj [ j ] [ 1 ] , traj [ j ] [ 2 ] ) ) :
# Add the beginning only the first time
if j == i + 1 :
2021-10-06 10:26:28 +02:00
lmdk_cur . append ( [ usr , traj [ i ] [ 1 ] , traj [ i ] [ 2 ] , traj [ i ] [ 3 ] ] )
2021-07-25 15:51:51 +02:00
# Add the new point
2021-10-06 10:26:28 +02:00
lmdk_cur . append ( [ usr , traj [ j ] [ 1 ] , traj [ j ] [ 2 ] , traj [ j ] [ 3 ] ] )
2021-07-25 15:51:51 +02:00
# Distance in meters
dist = distance ( ( traj [ i ] [ 1 ] , traj [ i ] [ 2 ] ) , ( traj [ j ] [ 1 ] , traj [ j ] [ 2 ] ) ) . km * 1000
# Distance exceeded or reached end of iteration
if dist > args . dist or j == len ( traj ) - 1 :
# Period in minutes
per = abs ( datetime . fromtimestamp ( int ( traj [ i ] [ 3 ] ) ) - datetime . fromtimestamp ( int ( traj [ j ] [ 3 ] ) ) ) . total_seconds ( ) / 60
# Check if enough time passed
if per > args . per :
# usrs_id starts from 1
lmdk_id + = 1
# Assign id to current landmark
for l in lmdk_cur :
l [ 0 ] = lmdk_id
# Append current landmark
lmdks + = lmdk_cur
# Continue checking from the current point
i = j
break
# No landmark was found, continue from next point
if i == 0 or not is_valid ( ( traj [ i ] [ 1 ] , traj [ i ] [ 2 ] ) ) or ' j ' not in vars ( ) or i != j :
i + = 1
print ( ' [OK] ' )
if lmdks :
usrs_lmdks = np . append ( usrs_lmdks , np . asarray ( lmdks , dtype = np . float32 ) , axis = 0 )
return usrs_lmdks
2021-10-06 01:44:50 +02:00
def find_lmdks_seq ( seq , lmdks ) :
lmdks_seq = [ ]
for i , p in enumerate ( seq ) :
if any ( np . equal ( lmdks , p ) . all ( 1 ) ) :
lmdks_seq . append ( i + 1 )
2021-10-06 10:18:14 +02:00
return np . array ( lmdks_seq , dtype = int )
2021-10-06 01:44:50 +02:00
2021-09-29 02:33:15 +02:00
def find_lmdks_tim ( lmdk_data , seq , uid , pct ) :
2021-09-29 00:12:57 +02:00
'''
Find user ' s landmarks timestamps.
Parameters :
lmdk_data - The landmarks contacts for all users per
landmarks percentage .
0 : uid , 1 : lmdk_pct , 2 : contacts
seq - The users ' data.
0 : uid , 1 : lmdk_pct , 2 : contacts
uid - The user ' s id that we are interested in.
pct - The landmarks percentage .
Returns :
lmdks - The user ' s landmarks timestamps for the given
landmarks percentage .
0 : tim , 1 : uid_a , 2 : uid_b , 3 : rssi
'''
# Initialize user's landmarks
lmdks = np . empty ( 0 )
# All the sequence
if pct == 100 :
# Get all timestamps
return seq [ : , 1 ]
# Find for given percentage
elif pct != 0 :
# All user's landmark contacts for all landmarks percentages
usr_lmdks = lmdk_data [ lmdk_data [ : , 0 ] == uid ]
# User's landmark contacts for given percentage
pct_lmdks = ast . literal_eval ( usr_lmdks [ usr_lmdks [ : , 1 ] == str ( pct / 100 ) ] [ 0 ] [ 2 ] )
# Find landmarks timestamps
for u in pct_lmdks :
lmdks = np . concatenate ( ( lmdks , np . array ( seq [ seq [ : , 2 ] == float ( u ) ] ) [ : , 0 ] ) )
return lmdks
2021-09-29 02:33:15 +02:00
def find_lmdks_cont ( lmdk_data , seq , uid , pct ) :
'''
Find user ' s landmarks contacts.
Parameters :
lmdk_data - The landmarks contacts for all users per
landmarks percentage .
0 : uid , 1 : lmdk_pct , 2 : contacts
seq - The users ' data.
2021-10-07 12:32:46 +02:00
0 : tim , 1 : uid , 2 : cont , 3 : rssi
2021-09-29 02:33:15 +02:00
uid - The user ' s id that we are interested in.
pct - The landmarks percentage .
Returns :
lmdks - The user ' s landmarks contacts for the given
landmarks percentage .
2021-10-07 12:32:46 +02:00
0 : tim , 1 : uid , 2 : cont , 3 : rssi
2021-09-29 02:33:15 +02:00
'''
# Initialize user's landmarks
2021-09-29 12:59:29 +02:00
lmdks = np . empty ( 0 ) . reshape ( 0 , 4 )
2021-09-29 02:33:15 +02:00
# All the sequence
if pct == 100 :
# Get all timestamps
2021-09-29 12:59:29 +02:00
return seq
2021-09-29 02:33:15 +02:00
# Find for given percentage
elif pct != 0 :
# All user's landmark contacts for all landmarks percentages
usr_lmdks = lmdk_data [ lmdk_data [ : , 0 ] == uid ]
2021-09-29 12:59:29 +02:00
conts = ast . literal_eval ( usr_lmdks [ usr_lmdks [ : , 1 ] == str ( pct / 100 ) ] [ 0 ] [ 2 ] )
# Actual landmarks
for c in conts :
lmdks = np . vstack ( [ lmdks , seq [ seq [ : , 2 ] == c ] ] )
2021-09-29 02:33:15 +02:00
return lmdks
2021-07-25 15:51:51 +02:00
def lmdks_stats ( args , usrs_lmdks ) :
'''
Generate landmarks ' stats.
Parameters :
args - The execution arguments .
usrs_lmdks - The user ' landmarks.
0 : lid , 1 : uid , 2 : lat , 3 : lng , 4 : tim
Returns :
Nothing .
'''
lmdks_stats = {
' total ' : 0 ,
' len ' : 0 ,
' count ' : { } ,
}
# Check every user
usrs = np . unique ( usrs_lmdks [ : , 1 ] )
for usr in usrs :
print (
' [ %d %% ] Calculating landmark stats for user %d / %d ... '
% ( usr * 100 / len ( usrs ) , usr , np . max ( usrs ) ) ,
end = ' ' , flush = True
)
# Check each user's landmarks
for lmdk in np . unique ( usrs_lmdks [ usrs_lmdks [ : , 1 ] == usr , 0 ] ) :
lmdk_traj = usrs_lmdks [ usrs_lmdks [ : , 0 ] == lmdk , : ]
lmdks_stats [ ' total ' ] + = 1
lmdks_stats [ ' len ' ] + = len ( lmdk_traj )
if lmdks_stats [ ' count ' ] . get ( len ( lmdk_traj ) ) == None :
lmdks_stats [ ' count ' ] [ len ( lmdk_traj ) ] = 1
else :
lmdks_stats [ ' count ' ] [ len ( lmdk_traj ) ] + = 1
print ( ' [OK] ' )
hist_min , hist_max = min ( lmdks_stats [ ' count ' ] , key = int , default = 0 ) , max ( lmdks_stats [ ' count ' ] , key = int , default = 0 )
# Make histogram
x = [ ]
for i in range ( hist_min , hist_max + 1 ) :
if lmdks_stats [ ' count ' ] . get ( i ) != None :
x . extend ( [ i ] * lmdks_stats [ ' count ' ] . get ( i ) )
# Show stats
print (
' \n ############ Stats ########### \n '
' Landmarks : %d \n '
' Length \n '
' Total : %d ( %.2f %% ) \n '
' Minimum : %d \n '
' Maximum : %d \n '
' ############################## \n '
% ( lmdks_stats [ ' total ' ] , lmdks_stats [ ' len ' ] , ( lmdks_stats [ ' len ' ] / args . time ) * 100 , min ( lmdks_stats [ ' count ' ] , key = int , default = 0 ) , max ( lmdks_stats [ ' count ' ] , key = int , default = 0 ) )
)
# # Initialize plot
# plot_init()
# # Set x axis
# plt.xlabel('Landmarks sequence length')
# plt.xticks(rotation='vertical')
# # The y axis
# plt.ylabel('Number of sequences')
# plt.yscale('log')
# Create histogram
# plt.hist(x, bins=(hist_max - hist_min))
# Show plot
# plt.show()
# Save plot
# save_plot(os.path.dirname(args.arc) + '/' + 'results' + '(dis=' + str(args.dist) + ', per=' + str(args.per) + ').pdf')
def should_sample ( samp_rt ) :
'''
Randomly decide to release with noise .
Parameters :
samp_rt - The sampling rate ( 0 , 1 ]
Returns :
True / False
'''
return random . choices ( population = [ True , False ] , weights = [ samp_rt , 1 - samp_rt ] , k = 1 ) [ 0 ]
def is_landmark ( p , lmdks ) :
'''
Check is a point is a landmark .
Parameters :
p - The point to check .
lmdks - A 2 d array of landmarks .
Returns :
True / False
'''
2021-10-06 10:26:28 +02:00
if len ( lmdks ) and any ( np . equal ( lmdks , p ) . all ( 1 ) ) :
2021-07-25 15:51:51 +02:00
return True
return False