the-last-thing/code/lib/lmdk_lib.py

1094 lines
29 KiB
Python
Raw Permalink Normal View History

2021-07-25 15:51:51 +02:00
#!/usr/bin/env python3
2021-09-29 00:12:57 +02:00
import ast
2021-07-25 15:51:51 +02:00
from datetime import datetime
from geopy.distance import distance
import heapq
import itertools
from scipy.special import lambertw
import math
import numpy as np
import os
import random
import scipy.stats as stats
import time
from matplotlib import pyplot as plt
import zipfile
# Plot globals
dpi = 300
font_size = 24.0
line_width = 2.0
marker_size = 14.0
tick_length = 8.0
def add_polar_noise(loc, epsilon):
'''
Add noise from planar Laplace.
[https://github.com/chatziko/location-guard/blob/master/src/js/common/laplace.js]
Parameters:
2021-09-29 00:54:08 +02:00
loc - The original location.
epsilon - The privacy budget.
2021-07-25 15:51:51 +02:00
Returns:
2021-09-29 00:54:08 +02:00
The perturbed location (lat, lng).
2021-07-25 15:51:51 +02:00
'''
# The bearing in radians.
# Random number in [0, 2*pi),
theta = np.random.uniform(low = 0, high = np.pi*2)
# The distance in meters.
# Use the inverse cumulative polar laplacian distribution function.
r = -np.real((lambertw((np.random.uniform(low = 0, high = 1) - 1)/np.e, k = -1) + 1)/epsilon)
new_loc = None
while not is_valid(new_loc):
new_loc = distance(kilometers = r/1000).destination(point = loc, bearing = np.degrees(theta))
return new_loc
2021-09-29 01:03:50 +02:00
def randomized_response(r, epsilon):
'''
Return the original response with probability p,
or flip the response with probability 1 - p.
[https://www.usenix.org/system/files/conference/usenixsecurity17/sec17-wang-tianhao.pdf]
Parameters:
r - The original response (True/False).
epsilon - The privacy budget.
Returns:
The randomized response.
'''
# Return the original value
if random.uniform(0, 1) <= math.exp(epsilon)/(math.exp(epsilon) + 1):
return r
# Flip the original value
else:
return not r
2021-09-29 19:56:38 +02:00
def add_laplace_noise(value, sens, epsilon):
'''
Add random noise to a data value.
Parameteres:
value - The value to add noise to.
sens - The query function sensitivity.
epsilon - The privacy budget.
Returns:
The value with the noise.
'''
return np.random.laplace(value, sens/epsilon)
2021-07-25 15:51:51 +02:00
def draw_line(line, x, label, marker):
axis_x = list(range(len(x)))
# plt.xticks(axis_x, x)
plt.plot(axis_x, x, label=label, marker=marker)
def eval_seq(dists):
'''
Calculate the standard deviation of a list of distances.
Parameters:
dists - A list of distances.
Returns:
The standard deviation of the distances.
'''
return np.std(dists)
def get_abs_dists(seq, comb, lmdks):
'''
Get the distances of the points in a combination from the start and end of the sequence.
Parameters:
seq - The point sequence.
comb - A possible point combination for a specified size.
lmdks - The landmarks.
Returns:
dists - The distances of the points in a combination.
'''
cur = np.append(comb, lmdks)
cur.sort()
start = seq[0]
end = seq[len(seq) - 1]
dists = np.zeros([len(cur)*2])
# Check if there are any points
if cur.size:
for i in range(0, len(cur)):
# From the start
dists[i*2] = abs(cur[i] - start)
# From the end
dists[i*2 + 1] = abs(cur[i] - end)
return dists
def get_combs(seq, size):
'''
Get all the possible landmark point combinations for a specified size.
Parameters:
seq - The point sequence.
size - The desired number of landmarks.
Returns:
All the possible landmark combinations for a specified size.
'''
return itertools.combinations(seq, size)
def get_combs_h(h, hist, max):
combs = []
for comb in itertools.product(range(h + 1), repeat=len(hist)):
check = True
for i, c in enumerate(comb):
if sum(comb) > max or c < hist[i] or c > h:
check = False
break
if check:
combs.append(comb)
return combs
def get_emd(h1, h2):
'''
The earth mover's distance (EMD), or Wasserstein distance, of two histograms.
[https://stats.stackexchange.com/a/151362]
Pele et al., The quadratic-chi histogram distance family
Parameters:
h1 - A histogram.
h2 - Another histogram.
Return:
The EMD of the histograms.
'''
return stats.wasserstein_distance(h1,h2)
# return stats.wasserstein_distance(get_norm_hist(h1), get_norm_hist(h2))
def get_hist(seq, lmdks):
'''
Create a histogram from a set of landmarks.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
Return:
hist - The histogram of landmarks.
h - The bin size.
'''
# # Create a landmark sequence relative to the entire sequence
# lmdks_rel = np.array(lmdks)
# # Add the start of the sequence if not in landmarks
# if not np.any(lmdks_rel[:] == start):
# lmdks_rel = np.insert(lmdks_rel, 0, start)
# # Add the end of the sequence if not in landmarks
# if not np.any(lmdks_rel[:] == end):
# lmdks_rel = np.append(lmdks_rel, end)
# Dealing with zeros.
2021-10-13 09:03:36 +02:00
if len(seq) == 0:
2021-07-25 15:51:51 +02:00
return np.zeros(math.ceil(max(seq))), 1
2021-10-13 09:03:36 +02:00
elif len(lmdks) == 0:
return np.zeros(1), len(seq)
2021-07-25 15:51:51 +02:00
# Interquartile range (IQR) is a measure of statistical dispersion, being equal to the difference between 75th and 25th percentiles, or between upper and lower quartiles.
# https://en.wikipedia.org/wiki/Interquartile_range
iqr = stats.iqr(lmdks)
# Use the FreedmanDiaconis rule to select the width of the bins to be used in the histogram.
# Robust (resilient to outliers) estimator that takes into account data variability and data size.
# https://en.wikipedia.org/wiki/Freedman%E2%80%93Diaconis_rule
h = 2*iqr*len(lmdks)**(-1/3)
if h < 1:
h = 1
# On the number of bins and width:
# https://en.wikipedia.org/wiki/Histogram#Number_of_bins_and_width
# Normalize the interval
h = math.ceil(max(seq)/math.ceil(max(seq)/math.ceil(h)))
# Create an empty histogram with intervals of size h
2021-10-06 10:18:14 +02:00
hist = np.zeros(math.ceil(max(seq)/h), dtype = int)
2021-07-25 15:51:51 +02:00
for lmdk in lmdks:
hist[int(lmdk/h) - 1] += 1
return hist, h
def get_lmdks(seq, size, dist):
'''
Get a landmark set for a distribution type.
Parameters:
seq - The point sequence.
size - The landmarks' size.
dist - The distribution type code.
-1 - Left-skewed
0 - Symmetric
+1 - Right-skewed
+2 - Bimodal
+3 - Uniform
Return:
hist - The histogram of landmarks.
h - The bin size.
'''
scale = len(seq)/10
if dist == -1 or dist == 0 or dist == 1:
return MixtureModel([truncnorm(min(seq), max(seq), get_loc(seq, dist), scale)]).sample(min(seq), max(seq), [], size)
elif dist == 2:
return MixtureModel([
truncnorm(min(seq), max(seq), get_loc(seq, -1), scale),
truncnorm(min(seq), max(seq), get_loc(seq, +1), scale)
]).sample(min(seq), max(seq), [], size)
else:
return np.sort(np.random.choice(np.arange(min(seq), max(seq) + 1, 1), size=size, replace=False))
def get_lmdks_rand(seq, size, skew):
'''
Get a landmark set with a given skewness.
Parameters:
seq - The point sequence.
size - The landmarks' size.
skew - The desired skewness.
Return:
hist - The histogram of landmarks.
h - The bin size.
'''
# Get all possible landmark combinations
lmdk_combs = get_combs(seq, size)
# Find a set of landmarks for the given requirements
lmdks = ()
for comb in lmdk_combs:
skew_cur = get_skew(comb)
# Check if skewness is close to the requirement or have the same sign
if math.isclose(skew_cur, skew, rel_tol=.1) or ((skew_cur < 0) == (skew < 0)):
lmdks = comb
break
return lmdks
def get_loc(seq, skew):
'''
Get the location of the sequence with a given skewness.
Parameters:
seq - The point sequence.
skew - The desired skewness.
Return:
The location.
'''
loc_min = min(seq)
loc_max = max(seq)
if skew < 0:
loc_min = (max(seq) - min(seq))/2
elif skew > 0:
loc_max = (max(seq) - min(seq))/2
return int(loc_min + (loc_max - loc_min)/2)
def get_mean(seq, hist, h):
'''
Get the mean of the histogram.
Parameters:
seq - The point sequence.
hist - The histogram of landmarks.
h - The bin size.
Return:
The mean of the histogram.
'''
sum = 0
for idx, count in enumerate(hist):
# Find bin limits
start = min(seq) + h*idx
end = min(seq) + h*(idx + 1)
if(end > max(seq)):
end = max(seq)
sum += (start + (end - start)/2)*count
return sum/np.sum(hist)
def get_norm(h1, h2):
'''
The Euclidean distance of two histograms.
[https://stackoverflow.com/a/1401828/13123075]
Parameters:
h1 - A histogram.
h2 - Another histogram.
Return:
The Euclidean distance of the histograms.
'''
return np.linalg.norm(h1 - h2)
def get_norm_hist(hist):
'''
Normalize the histogram.
Parameters:
hist - The original histogram.
Return:
The normalized histogram.
'''
# In-place type conversion
norm_hist = hist.astype(np.float32)
n = np.sum(norm_hist)
for i, a in enumerate(norm_hist):
if a:
norm_hist[i] = a/n
return norm_hist
def get_opts(seq, lmdks):
'''
Finds all the possible valid options.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
Returns:
A list with all the possible options.
Requirements:
n = Regular points
r = The size of a combination
Time - O(C(n, r) + 2^C(n, r)), because O(n*C(n, r)) = O(n*n^min(r, n - r))
Space - O(r*C(n, r))
'''
reg = get_reg(seq, lmdks)
# Find all possible combinations for every k
combs = []
for k in range(len(lmdks) + 1, seq[len(seq) - 1] + 1):
combs.append(get_combs(reg, k - len(lmdks)))
# Find all possible options for all combinations
opts = itertools.product(*combs)
# Keep only valid options, i.e., larger sets must contain every element of every smaller set
rslts = []
for opt in opts:
for i, _ in enumerate(opt):
if i and not set(opt[i - 1]).issubset(opt[i]):
break
if i == len(opt) - 1:
rslts.append(opt)
return rslts
def get_reg(seq, lmdks):
'''
Get the regular points, i.e., non-landmarks, in a sequence.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
Returns:
The non-landmark points in a sequence.
'''
return np.array([i for i in seq if i not in lmdks])
def get_rel_dists(seq, comb, lmdks):
'''
Get the distances of the points in a combination from the nearest point of reference.
That is, the previous/next point or the start/end if closer.
Parameters:
seq - The point sequence.
comb - A possible point combination for a specified size.
lmdks - The landmarks.
Returns:
dists - The distances of the points in a combination.
'''
# TODO: Review distances. Maybe take into account both ways?
cur = np.append(comb, lmdks)
cur.sort()
start = seq[0]
end = seq[len(seq) - 1]
dists = np.zeros([len(cur) + 1])
# Check if there are any points
if cur.size:
# First
dists[0] = abs(cur[0] - start)
# In-between
for i in range(0, len(cur) - 1):
dists[i + 1] = abs(cur[i] - cur[i + 1])
# Last
dists[len(cur)] = abs(cur[len(cur) - 1] - end)
return dists
def get_seq(start, end):
'''
Get a sequence of points.
Parameters:
start - The starting point of the sequence.
end - The ending point of the sequence.
Returns:
A point sequence.
'''
return np.arange(start, end + 1)
def get_shannon_ent(a):
'''
Get the Shannon entropy of an array.
It can be used to calculate the uniformity of a histogram.
Uniform probability yields maximum uncertainty and therefore maximum entropy.
Entropy, then, can only decrease from the value associated with uniform probability.
Parameters:
a - An array of integers
Return:
The Shannon entropy.
'''
# Keep only the non-zero elements of array a because log2(0) is -inf.
# The entropy of an event with zero probability is 0.
a = a[a != 0]
# Histograms are discrete probability distributions.
# We convert the counts to probabilities.
p_a = a/np.sum(a)
# Base 2 gives the unit of bits (or 'shannons').
return -np.sum(p_a*np.log2(p_a))
def get_shannon_ent_norm(a):
'''
Get the normalized Shannon entropy of an array.
It ranges from 0 (high entropy - close to uniform) to 1 (low entropy).
It ranges from 0 (high entropy - close to uniform) to 1 (low entropy).
It ranges from 0 (high entropy - close to uniform) to 1 (low entropy).
Parameters:
a - An array of integers
Return:
The normalized Shannon entropy.
'''
return 1 - get_shannon_ent(a)/np.log2(len(a))
def get_skew(lmdks):
'''
Fisher-Pearson coefficient of skewness.
negative - left (more data items right)
zero - symmetric data
positive - right (more data items left)
positive - right (more data items left)
positive - right (more data items left)
Parameters:
seq - The point sequence.
hist - The histogram of landmarks.
h - The bin size.
Return:
Fisher-Pearson coefficient of skewness.
'''
# def get_skew(seq, hist, h):
# return get_third(seq, hist, h)/get_std(seq, hist, h)**3
return stats.skew(lmdks)
def get_std(seq, hist, h):
'''
Get the standard deviation.
Parameters:
seq - The point sequence.
hist - The histogram of landmarks.
h - The bin size.
Return:
The standard deviation.
'''
hist_mean = get_mean(seq, hist, h)
sum = 0
for idx, count in enumerate(hist):
# Find bin limits
start = min(seq) + h*idx
end = min(seq) + h*(idx + 1)
if(end > max(seq)):
end = max(seq)
sum += (((start + (end - start)/2) - hist_mean)**2)*count
return sum/np.sum(hist)
def get_third(seq, hist, h):
'''
Get the third central moment.
Parameters:
seq - The point sequence.
hist - The histogram of landmarks.
h - The bin size.
Return:
The third central moment.
'''
hist_mean = get_mean(seq, hist, h)
sum = 0
for idx, count in enumerate(hist):
# Find bin limits
start = min(seq) + h*idx
end = min(seq) + h*(idx + 1)
if(end > max(seq)):
end = max(seq)
sum += (((start + (end - start)/2) - hist_mean)**3)*count
return sum/np.sum(hist)
def dist_type_to_str(d):
'''
Convert the distribution type code to string.
Parameters:
t - The distribution type code.
-1 - Left-skewed
0 - Symmetric
+1 - Right-skewed
+2 - Bimodal
+3 - Uniform
Return:
The distribution type.
'''
if d == -1:
return 'Left-skewed'
elif d == 0:
return 'Symmetric'
elif d == 1:
return 'Right-skewed'
elif d == 2:
return 'Bimodal'
elif d == 3:
return 'Uniform'
else:
return 'Undefined'
class MixtureModel(stats.rv_continuous):
'''
A mixture model of continuous probability distributions
[https://stackoverflow.com/a/47763145/13123075]
'''
def __init__(self, submodels, *args, **kwargs):
super().__init__(*args, **kwargs)
self.submodels = submodels
def _pdf(self, x):
pdf = self.submodels[0].pdf(x)
for submodel in self.submodels[1:]:
pdf += submodel.pdf(x)
pdf /= len(self.submodels)
return pdf
def rvs(self, size):
submodel_choices = np.random.randint(len(self.submodels), size=size)
submodel_samples = [submodel.rvs(size=size) for submodel in self.submodels]
rvs = np.choose(submodel_choices, submodel_samples)
return rvs
def sample(self, lower, upper, sample, size):
'''
Sample from the mixture model without replacement.
[https://stackoverflow.com/a/20548895/13123075]
Parameters:
self - The mixture model.
lower - The lower bound of the range.
upper - The upper bound of the range.
sample - Items to be excluded.
size - The sample size.
Returns:
A sample.
'''
w = self.pdf(range(int(lower), int(upper) + 1))
idx = []
for i in range(int(lower), int(upper) + 1):
if i not in sample:
idx.append(i)
elt = [(math.log(random.random()) / w[i - 1], i) for i in idx]
return np.sort([x[1] for x in heapq.nlargest(size, elt)])
def plot_init():
'''
Initialize the plot.
'''
# Reset
plt.close('all')
# Style
plt.style.use('classic')
# DPI
plt.figure(dpi=dpi)
# Font
plt.rc('font', family='sans-serif')
plt.rc('font', **{'sans-serif':['Liberation Sans']})
plt.rc('font', size=font_size)
2022-02-18 00:49:42 +01:00
# Deal with Type 3 fonts
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42
2021-07-25 15:51:51 +02:00
# Grid
plt.setp(plt.figure().add_subplot(111).spines.values(), linewidth=line_width)
plt.grid(True, axis='y', linewidth=line_width)
# Ticks
plt.gca().tick_params(which='both', width=line_width)
plt.gca().tick_params(which='major', length=tick_length)
plt.gca().tick_params(which='minor', length=tick_length/2)
# Colors
plt.subplot(111).set_prop_cycle('color', ['#212121', '#616161', '#9e9e9e', '#bdbdbd', '#e0e0e0', 'f5f5f5'])
# Layout
plt.tight_layout()
def plot_legend():
'''
Initialize the plot legend.
'''
plt.legend(
loc='best',
fontsize=font_size,
numpoints=1,
borderpad=.2, # default: 0.4
labelspacing=.25, # default: 0.5
handlelength=2.0, # default: 2.0
handletextpad=.4, # default: 0.8
borderaxespad=.5, # default: 0.5
)
def save_plot(path):
'''
Save the plot to a file.
Parameters:
path - The desired location.
Returns:
Nothing.
'''
# Save plot
os.makedirs(os.path.dirname(path), exist_ok=True)
plt.savefig(path, bbox_inches='tight')
# Clean up
plt.clf()
plt.close()
def print_lmdks(seq, lmdks):
'''
Print the landmarks.
Parameters:
seq - The point sequence.
lmdks - The landmarks.
Returns:
Nothing.
'''
print('\n######### Landmarks ##########')
print(lmdks)
print('[', end='', flush=True)
for p in seq:
if np.any(lmdks[:] == p):
print('\'', end='', flush=True)
else:
print('.', end='', flush=True)
print(']', end='', flush=True)
print('\n##############################\n')
def plot_dist(seq, dist):
'''
Plot the probability distribution.
Parameters:
seq - The point sequence.
size - The landmarks' size.
dist - The distribution type code.
-1 - Left-skewed
0 - Symmetric
+1 - Right-skewed
+2 - Bimodal
+3 - Uniform
Returns:
Nothing.
'''
scale = len(seq)/10
p = stats.uniform.pdf(seq)
if dist == -1 or dist == 0 or dist == 1:
p = MixtureModel([truncnorm(min(seq), max(seq), get_loc(seq, dist), scale)]).pdf(seq)
elif dist == 2:
p = MixtureModel([
truncnorm(min(seq), max(seq), get_loc(seq, -1), scale),
truncnorm(min(seq), max(seq), get_loc(seq, +1), scale)
]).pdf(seq)
plt.plot(seq, p)
plt.show()
def simplify_data(seq, lmdks):
'''
Get synthetic from real data.
Parameters:
seq - The trajectory.
lmdks - The landmarks.
Returns:
The simplified sequence and landmarks.
'''
seq_s = get_seq(1, len(seq))
lmdks_s = []
for i, p in enumerate(seq):
if is_landmark(p, lmdks):
lmdks_s.append(i + 1)
return seq_s, np.array(lmdks_s)
def truncnorm(lower, upper, loc, scale):
'''
A truncated normal continuous random variable.
Parameters:
lower - The lower bound of the range.
upper - The upper bound of the range.
loc - The location of the distribution.
scale - The spread of the distribution.
Returns:
A truncated normal continuous random variable.
'''
return stats.truncnorm((lower - loc)/scale, (upper - loc)/scale, loc, scale)
def is_valid(coord):
'''
Check if coordinates are valid.
Parameters:
coord - The coordinates to be checked.
Returns:
True - Valid coordinates.
False - Invalid coordinates.
'''
if coord is None or (float(coord[0]) > 90 or float(coord[0]) < -90 # lat
or float(coord[1]) > 180 or float(coord[1]) < -180): #lng
return False
return True
def save_data(args, data, name):
'''
Save to file and compress.
Parameters:
args - The execution arguments.
data - The data.
name - The name.
Returns:
Nothing.
'''
try:
path = os.path.dirname(os.path.abspath(args.res)) + '/' + name
if 'lmdks' in name:
path += '(dis=' + str(args.dist) + ', per=' + str(args.per) + ')'
print('Saving to %s... ' %(path), end='', flush=True)
with open(path, 'wb') as file:
_, idx = np.unique(data, axis=0, return_index=True)
np.save(file, data[np.sort(idx)], allow_pickle=False)
print('[OK]')
# Compress file
print('Compressing in %s... ' %(args.res), end='', flush=True)
with zipfile.ZipFile(args.res, 'a') as zip_file:
zip_file.write(path, os.path.basename(path))
print('[OK]')
# Delete file
print('Deleting %s... ' %(path), end='', flush=True)
os.remove(path)
print('[OK]')
except Exception as e:
print('[Error: %s]' %(e))
def load_data(args, name):
'''
Load data from file.
Parameters:
args - The execution arguments.
data - The data.
name - The name.
Returns:
data - The data set.
0: uid, 1: lat, 2: lng, 3: tim
'''
data = np.array([])
try:
path = os.path.dirname(os.path.abspath(args.res)) + '/' + name
if 'lmdks' in name:
path += '(dis=' + str(args.dist) + ', per=' + str(args.per) + ')'
print('Loading %s... ' %(path), end='', flush=True)
with zipfile.ZipFile(args.res, 'r') as zip_file:
data = np.load(zip_file.open(os.path.basename(path)))
print('[OK]')
except Exception as e:
print('[Error: %s]' %(e))
return data
def find_lmdks(usrs_data, args):
'''
Find users' landmarks.
Parameters:
args - The execution arguments.
usrs_data - The users' data.
0: uid, 1: lat, 2: lng, 3: tim
Returns:
usrs_lmdks - The users' landmarks.
2021-10-06 10:26:28 +02:00
0: uid, 1: lat, 2: lng, 3: tim
2021-07-25 15:51:51 +02:00
'''
2021-10-06 10:26:28 +02:00
usrs_lmdks = np.empty((0,4), np.float32)
2021-07-25 15:51:51 +02:00
traj_cur = 0
usrs = np.unique(usrs_data[:,0])
for usr_i, usr in enumerate(usrs):
# Initialize user's landmarks list
lmdks = []
traj = usrs_data[usrs_data[:,0]==usr, :]
traj_cur += len(traj)
print(
'[%d%%] Points: %d/%d | Users: %d/%d... '
%((traj_cur/usrs_data.shape[0])*100, traj_cur, usrs_data.shape[0], usr, usrs[len(usrs) - 1]),
end='', flush=True
)
# Check the actual points
i = 0
while i < len(traj) - 1:
lmdk_cur = []
if is_valid((traj[i][1], traj[i][2])):
for j in range(i + 1, len(traj)):
if is_valid((traj[j][1], traj[j][2])):
# Add the beginning only the first time
if j == i + 1:
2021-10-06 10:26:28 +02:00
lmdk_cur.append([usr, traj[i][1], traj[i][2], traj[i][3]])
2021-07-25 15:51:51 +02:00
# Add the new point
2021-10-06 10:26:28 +02:00
lmdk_cur.append([usr, traj[j][1], traj[j][2], traj[j][3]])
2021-07-25 15:51:51 +02:00
# Distance in meters
dist = distance((traj[i][1], traj[i][2]), (traj[j][1], traj[j][2])).km*1000
# Distance exceeded or reached end of iteration
if dist > args.dist or j == len(traj) - 1:
# Period in minutes
per = abs(datetime.fromtimestamp(int(traj[i][3])) - datetime.fromtimestamp(int(traj[j][3]))).total_seconds()/60
# Check if enough time passed
if per > args.per:
# Append current landmark
lmdks += lmdk_cur
# Continue checking from the current point
i = j
break
# No landmark was found, continue from next point
if i == 0 or not is_valid((traj[i][1], traj[i][2])) or 'j' not in vars() or i != j:
i += 1
print('[OK]')
if lmdks:
usrs_lmdks = np.append(usrs_lmdks, np.asarray(lmdks, dtype=np.float32), axis=0)
return usrs_lmdks
2021-10-06 01:44:50 +02:00
def find_lmdks_seq(seq, lmdks):
lmdks_seq = []
for i, p in enumerate(seq):
2021-10-08 20:14:21 +02:00
if is_landmark(p, lmdks):
2021-10-06 01:44:50 +02:00
lmdks_seq.append(i + 1)
2021-10-06 10:18:14 +02:00
return np.array(lmdks_seq, dtype = int)
2021-10-06 01:44:50 +02:00
def find_lmdks_tim(lmdk_data, seq, uid, pct):
2021-09-29 00:12:57 +02:00
'''
Find user's landmarks timestamps.
Parameters:
lmdk_data - The landmarks contacts for all users per
landmarks percentage.
0: uid, 1: lmdk_pct, 2: contacts
seq - The users' data.
0: uid, 1: lmdk_pct, 2: contacts
uid - The user's id that we are interested in.
pct - The landmarks percentage.
Returns:
lmdks - The user's landmarks timestamps for the given
landmarks percentage.
0: tim, 1: uid_a, 2: uid_b, 3: rssi
'''
# Initialize user's landmarks
lmdks = np.empty(0)
# All the sequence
if pct == 100:
# Get all timestamps
return seq[:, 1]
# Find for given percentage
elif pct != 0:
# All user's landmark contacts for all landmarks percentages
usr_lmdks = lmdk_data[lmdk_data[:, 0] == uid]
# User's landmark contacts for given percentage
pct_lmdks = ast.literal_eval(usr_lmdks[usr_lmdks[:, 1] == str(pct/100)][0][2])
# Find landmarks timestamps
for u in pct_lmdks:
lmdks = np.concatenate((lmdks, np.array(seq[seq[:, 2] == float(u)])[:, 0]))
return lmdks
def find_lmdks_cont(lmdk_data, seq, uid, pct):
'''
Find user's landmarks contacts.
Parameters:
lmdk_data - The landmarks contacts for all users per
landmarks percentage.
0: uid, 1: lmdk_pct, 2: contacts
seq - The users' data.
2021-10-07 12:32:46 +02:00
0: tim, 1: uid, 2: cont, 3: rssi
uid - The user's id that we are interested in.
pct - The landmarks percentage.
Returns:
lmdks - The user's landmarks contacts for the given
landmarks percentage.
2021-10-07 12:32:46 +02:00
0: tim, 1: uid, 2: cont, 3: rssi
'''
# Initialize user's landmarks
2021-09-29 12:59:29 +02:00
lmdks = np.empty(0).reshape(0,4)
# All the sequence
if pct == 100:
# Get all timestamps
2021-09-29 12:59:29 +02:00
return seq
# Find for given percentage
elif pct != 0:
# All user's landmark contacts for all landmarks percentages
usr_lmdks = lmdk_data[lmdk_data[:, 0] == uid]
2021-09-29 12:59:29 +02:00
conts = ast.literal_eval(usr_lmdks[usr_lmdks[:, 1] == str(pct/100)][0][2])
# Actual landmarks
for c in conts:
lmdks = np.vstack([lmdks, seq[seq[:, 2] == c]])
return lmdks
2021-07-25 15:51:51 +02:00
def lmdks_stats(args, usrs_lmdks):
'''
Generate landmarks' stats.
Parameters:
args - The execution arguments.
usrs_lmdks - The user' landmarks.
0: lid, 1: uid, 2: lat, 3: lng, 4: tim
Returns:
Nothing.
'''
lmdks_stats = {
'total': 0,
'len': 0,
'count': {},
}
# Check every user
usrs = np.unique(usrs_lmdks[:, 1])
for usr in usrs:
print(
'[%d%%] Calculating landmark stats for user %d/%d... '
%(usr*100/len(usrs), usr, np.max(usrs)),
end='', flush=True
)
# Check each user's landmarks
for lmdk in np.unique(usrs_lmdks[usrs_lmdks[:, 1] == usr, 0]):
lmdk_traj = usrs_lmdks[usrs_lmdks[:, 0] == lmdk, :]
lmdks_stats['total'] += 1
lmdks_stats['len'] += len(lmdk_traj)
if lmdks_stats['count'].get(len(lmdk_traj)) == None:
lmdks_stats['count'][len(lmdk_traj)] = 1
else:
lmdks_stats['count'][len(lmdk_traj)] += 1
print('[OK]')
hist_min, hist_max = min(lmdks_stats['count'], key=int, default=0), max(lmdks_stats['count'], key=int, default=0)
# Make histogram
x = []
for i in range(hist_min, hist_max + 1):
if lmdks_stats['count'].get(i) != None:
x.extend([i]*lmdks_stats['count'].get(i))
# Show stats
print(
'\n############ Stats ###########\n'
'Landmarks : %d\n'
' Length\n'
' Total : %d (%.2f%%)\n'
' Minimum : %d\n'
' Maximum : %d\n'
'##############################\n'
%(lmdks_stats['total'], lmdks_stats['len'], (lmdks_stats['len']/args.time)*100, min(lmdks_stats['count'], key=int, default=0), max(lmdks_stats['count'], key=int, default=0))
)
# # Initialize plot
# plot_init()
# # Set x axis
# plt.xlabel('Landmarks sequence length')
# plt.xticks(rotation='vertical')
# # The y axis
# plt.ylabel('Number of sequences')
# plt.yscale('log')
# Create histogram
# plt.hist(x, bins=(hist_max - hist_min))
# Show plot
# plt.show()
# Save plot
# save_plot(os.path.dirname(args.arc) + '/' + 'results' + '(dis=' + str(args.dist) + ', per=' + str(args.per) + ').pdf')
def should_sample(samp_rt):
'''
Randomly decide to release with noise.
Parameters:
samp_rt - The sampling rate (0, 1]
Returns:
True/False
'''
return random.choices(population=[True, False], weights=[samp_rt, 1 - samp_rt], k=1)[0]
def is_landmark(p, lmdks):
'''
Check is a point is a landmark.
Parameters:
p - The point to check.
lmdks - A 2d array of landmarks.
Returns:
True/False
'''
2021-10-08 20:14:21 +02:00
if len(lmdks) > 0 and any(np.equal(lmdks, p).all(1)):
2021-07-25 15:51:51 +02:00
return True
return False
2022-01-07 06:17:37 +01:00
if __name__ == '__main__':
try:
seq = [1, 2, 3, 4, 5, 6, 7, 8]
lmdk = [1, 3, 5, 8]
print(get_hist(seq, lmdk))
start_time = time.process_time()
# main(args)
end_time = time.process_time()
print('##############################\n'
'Time : %.4fs\n'
'##############################\n'
%(end_time - start_time), flush=True)
except KeyboardInterrupt:
print('Interrupted by user.')
exit()