From 7dc347cf53899aa20b91e4bed0cc2e8071bdbc9a Mon Sep 17 00:00:00 2001 From: Manos Katsomallos Date: Sun, 25 Jul 2021 16:51:51 +0300 Subject: [PATCH] code: Added libraries --- code/lib/lmdk_bgt.py | 479 +++++++++++++++++++++ code/lib/lmdk_lib.py | 967 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 1446 insertions(+) create mode 100644 code/lib/lmdk_bgt.py create mode 100644 code/lib/lmdk_lib.py diff --git a/code/lib/lmdk_bgt.py b/code/lib/lmdk_bgt.py new file mode 100644 index 0000000..044988c --- /dev/null +++ b/code/lib/lmdk_bgt.py @@ -0,0 +1,479 @@ +#!/usr/bin/env python3 + +from geopy.distance import distance +import math +import numpy as np +import random +import sympy as sp +import time +import lmdk_lib +from matplotlib import pyplot as plt + + +def draw_bgts(title, bgts): + ''' + Plots the allocated budget. + + Parameters: + title - The title of the plot. + bgts - The allocated privacy budget. + Returns: + Nothing. + ''' + lmdk_lib.plot_init() + p = np.arange(1, len(bgts) + 1, 1) + plt.plot( + p, + bgts, + linewidth=lmdk_lib.line_width, + markersize=lmdk_lib.marker_size, + markeredgewidth=0, + label=r'$\varepsilon$', + marker='s' + ) + lmdk_lib.plot_legend() + # Set plot box + plt.axis([1, len(bgts), .0, max(bgts) + max(bgts)/2]) + # Set x axis label + plt.xlabel('Timestamp') + # Set y axis label + plt.ylabel('Privacy budget') + plt.title(title.replace('_', '-')) + plt.show() + + +def validate_bgts(seq, lmdks, epsilon, bgts): + ''' + Budget allocation validation. + + Parameters: + seq - The point sequence. + lmdks - The landmarks. + epsilon - The available privacy budget. + bgts - The privacy budget allocation. + Returns: + The index of the privacy budget that caused the failure or -1 if successful. + ''' + bgts_sum = .0 + # Landmarks + for i, p in enumerate(seq): + if np.any(lmdks[:] == p): + bgts_sum += bgts[i] + # Regular events + bgts_max = bgts_sum + for i, p in enumerate(seq): + if not np.any(lmdks[:] == p): + bgts_cur = bgts_sum + bgts[i] + if bgts_cur > bgts_max: + bgts_max = bgts_cur + if bgts_cur > epsilon and not math.isclose(bgts_cur, epsilon, rel_tol=.001): + print('Budget allocation validation failed: %.2f%% (%.4f/%.4f)' %(100*bgts_max/epsilon, bgts_max, epsilon)) + return i + print( + 'Landmark budget allocation: %.2f%% (%.4f/%.4f)\n' + 'Overall budget allocation : %.2f%% (%.4f/%.4f)\n' + %(100*bgts_max/epsilon, bgts_max, epsilon, + 100*sum(bgts)/epsilon, sum(bgts), epsilon,)) + return -1 + + +def uniform(seq, lmdks, epsilon): + ''' + Uniform budget allocation. + + Parameters: + seq - The point sequence. + lmdks - The landmarks. + epsilon - The available privacy budget. + Returns: + bgts - The privacy budget allocation. + ''' + bgts = np.zeros(len(seq)) + # Allocate enough budget for all landmarks and one regular point + k = len(lmdks) + 1 + # All points are landmarks + if k > len(seq): + k = len(lmdks) + # Allocate the budget + for i, _ in enumerate(bgts): + bgts[i] = epsilon/k + return bgts + + +def geometric(seq, lmdks, epsilon): + ''' + Geometric budget allocation. + + Parameters: + seq - The point sequence. + lmdks - The landmarks. + epsilon - The available privacy budget. + Returns: + bgts - The privacy budget distribution. 
+    '''
+    bgts = np.zeros([len(seq)])
+    epsilon_avail = epsilon
+    # Landmarks
+    for i, p in enumerate(seq):
+        if np.any(lmdks[:] == p):
+            bgts[i] = epsilon_avail/2
+            epsilon_avail = epsilon_avail - epsilon_avail/2
+    # Regular events
+    for i, p in enumerate(seq):
+        if not np.any(lmdks[:] == p):
+            bgts[i] = epsilon_avail
+    return bgts
+
+
+def exponential(seq, lmdks, epsilon):
+    '''
+    Exponential budget allocation (max to user-level).
+
+    Parameters:
+        seq - The point sequence.
+        lmdks - The landmarks.
+        epsilon - The available privacy budget.
+    Returns:
+        bgts - The privacy budget allocation.
+    '''
+    # Fallback to uniform if zero or max landmarks
+    if len(lmdks) == 0 or len(lmdks) == len(seq):
+        return uniform(seq, lmdks, epsilon)
+    # # In case of seq == lmdks
+    # l = 0
+    # N = epsilon/len(seq)
+    # Otherwise
+    bgts = np.zeros([len(seq)])
+    if len(seq) != len(lmdks):
+        # Find the worst-case regular point
+        p_sel = 0
+        for p in seq:
+            if not np.any(lmdks[:] == p):
+                p_sel = p
+                break
+        # List all landmark timestamps along with the worst-case regular point
+        points = np.append(lmdks, [p_sel])
+        points = np.sort(points)
+        # epsilon_t = N*e^(-l*t)
+        l = sp.symbols('l', real=True)
+        # Cumulative allocation at landmarks and one extra point, i.e., L union {t}
+        T = seq.max()
+        # Bounding the privacy budgets at L union {t}:
+        # epsilon = sum(N*e^(-l*t)) for t in L union {t}
+        t = sp.symbols('t')
+        eq = 0
+        for t in points:
+            eq += (((epsilon/len(seq))/(sp.exp(-1*l*T)))*sp.exp(-1*l*t))
+        try:
+            l = sp.solve(eq - epsilon, simplify=False, rational=False)[0]
+            # l = sp.solveset(eq <= epsilon, l, sp.S.Reals).sup
+        except Exception:
+            # No solution was found; return the zero allocation
+            return bgts
+        # Allocate epsilon/len(seq) to the last point T, i.e., user-level
+        N = (epsilon/len(seq))/sp.exp(-1*l*T)
+        # Allocate the budget
+        for i, t in enumerate(seq):
+            bgts[i] = N*sp.exp(-1*l*t)
+    return bgts
+
+
+def linear_zero(seq, lmdks, epsilon):
+    '''
+    Linear budget allocation (max to zero).
+
+    Parameters:
+        seq - The point sequence.
+        lmdks - The landmarks.
+        epsilon - The available privacy budget.
+    Returns:
+        bgts - The privacy budget allocation.
+    '''
+    bgts = np.zeros([len(seq)])
+    # Find the worst-case regular point
+    p_sel = 0
+    for p in seq:
+        if not np.any(lmdks[:] == p):
+            p_sel = p
+            break
+    # Sum all landmark timestamps along with the worst-case regular point
+    sum_cur = p_sel
+    for p in lmdks:
+        sum_cur += p
+    # epsilon_t = a*t + b
+    b = sp.symbols('b')
+    # Cumulative allocation at landmarks and one extra point
+    T = seq[len(seq) - 1]
+    b = sp.solve(b - ((b/T)*sum_cur + epsilon)/(len(lmdks) + 1))[0]
+    # Allocate 0 to the last point T
+    a = -b/T
+    # Allocate the budget
+    for i, t in enumerate(seq):
+        bgts[i] = a*t + b
+    return bgts
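+
+
+# Illustrative usage (not part of the original patch): a minimal, hedged
+# sketch comparing the allocation schemes above on a toy sequence; the
+# _demo_* name and all values are hypothetical.
+def _demo_budget_allocations():
+    seq = np.arange(1, 11)
+    lmdks = np.array([3, 7])
+    epsilon = 1.0
+    for alloc in (uniform, geometric, stepped):
+        bgts = alloc(seq, lmdks, epsilon)
+        print('%s: %s' %(alloc.__name__, np.round(bgts, 4)))
+        # validate_bgts() returns -1 when the allocation respects epsilon
+        validate_bgts(seq, lmdks, epsilon, bgts)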
+def linear(seq, lmdks, epsilon):
+    '''
+    Linear budget allocation (max to user-level).
+
+    Parameters:
+        seq - The point sequence.
+        lmdks - The landmarks.
+        epsilon - The available privacy budget.
+    Returns:
+        bgts - The privacy budget allocation.
+    '''
+    # Fallback to uniform if zero or max landmarks
+    if len(lmdks) == 0 or len(lmdks) == len(seq):
+        return uniform(seq, lmdks, epsilon)
+    # Find the worst-case regular point
+    p_sel = 0
+    for p in seq:
+        if not np.any(lmdks[:] == p):
+            p_sel = p
+            break
+    # Sum all landmark timestamps along with the worst-case regular point
+    sum_cur = p_sel + np.sum(lmdks)
+    # epsilon_t = a*t + b
+    b = sp.symbols('b', real=True)
+    # Cumulative allocation at landmarks and one extra point, i.e., L union {t}
+    T = seq.max()
+    # Number of points to take into account
+    k = len(lmdks) + 1
+    # if len(lmdks) == len(seq):
+    #     k = len(lmdks)
+    # Bounding the privacy budgets at L union {t}:
+    # epsilon = a*sum(L union {t}) + |L union {t}|*b
+    b = sp.solve(b + (((epsilon/len(seq) - b)/T)*sum_cur - epsilon)/k, simplify=False, rational=False)[0]
+    # Allocate epsilon/len(seq) to the last point T, i.e., user-level
+    a = (epsilon/len(seq) - b)/T
+    # Allocate the budget
+    bgts = np.zeros([len(seq)])
+    for i, t in enumerate(seq):
+        bgts[i] = a*t + b
+    return bgts
+
+
+def plot_line(x_i, arr, label, marker, line):
+    '''
+    Plot arr against x_i as a black line with the given label, marker, and line style.
+    '''
+    plt.plot(x_i,
+             arr,
+             label=label,
+             color='black',
+             marker=marker,
+             linestyle=line,
+             # linewidth=1.0,
+             markerfacecolor='none')
+
+
+def stepped(seq, lmdks, epsilon):
+    '''
+    Stepped budget allocation.
+
+    Parameters:
+        seq - The point sequence.
+        lmdks - The landmarks.
+        epsilon - The available privacy budget.
+    Returns:
+        bgts - The privacy budget allocation.
+    '''
+    bgts = np.zeros([len(seq)])
+    epsilon_avail = epsilon/2
+    for i, p in enumerate(seq):
+        # Halve the available budget when a landmark is reached
+        if np.any(lmdks[:] == p):
+            epsilon_avail = epsilon_avail/2
+        bgts[i] = epsilon_avail
+    return bgts
+
+
+def adaptive(seq, lmdks, epsilon):
+    '''
+    Adaptive budget allocation.
+
+    Parameters:
+        seq - The point sequence.
+        lmdks - The landmarks.
+        epsilon - The available privacy budget.
+    Returns:
+        rls_data - The perturbed data.
+        bgts - The privacy budget allocation.
+        skipped - The number of skipped releases.
+    '''
+    # Uniform budget allocation
+    bgts = uniform(seq, lmdks, epsilon)
+    # Released
+    rls_data = [None]*len(seq)
+    # The sampling rate
+    samp_rt = 1
+    # Track landmarks
+    lmdk_cur = 0
+    # Track skipped releases
+    skipped = 0
+    for i, p in enumerate(seq):
+        # Check if the current point is a landmark
+        if lmdk_lib.is_landmark(p, lmdks):
+            lmdk_cur += 1
+        # Get coordinates
+        loc = (p[1], p[2])
+        if lmdk_lib.should_sample(samp_rt) or i == 0:
+            # Add noise to original data
+            new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
+            rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
+            # Adjust the sampling rate
+            if i > 0:
+                if distance((rls_data[i - 1][1], rls_data[i - 1][2]), new_loc).km*1000 < 1/bgts[i]:
+                    # Decrease
+                    # samp_rt -= samp_rt*.9
+                    # samp_rt -= samp_rt*.75
+                    samp_rt -= samp_rt*.5
+                    # samp_rt -= samp_rt*.25
+                    # samp_rt -= samp_rt*.1
+                else:
+                    # Increase
+                    # samp_rt += (1 - samp_rt)*.9
+                    # samp_rt += (1 - samp_rt)*.75
+                    samp_rt += (1 - samp_rt)*.5
+                    # samp_rt += (1 - samp_rt)*.25
+                    # samp_rt += (1 - samp_rt)*.1
+        else:
+            skipped += 1
+            # Skip the current release and approximate with the previous one
+            rls_data[i] = rls_data[i - 1]
+            if lmdk_lib.is_landmark(p, lmdks):
+                # Allocate the current budget to the following releases uniformly
+                for j in range(i + 1, len(seq)):
+                    bgts[j] += bgts[i]/(len(lmdks) - lmdk_cur + 1)
+            # No budget was spent
+            bgts[i] = 0
+    return rls_data, bgts, skipped
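+
+
+# Illustrative usage (not part of the original patch): a hedged sketch of
+# adaptive() on a tiny synthetic trajectory. The rows follow the
+# [uid, lat, lng, tim] layout that lmdk_lib expects; all values are made up.
+def _demo_adaptive():
+    seq = np.array([
+        [1, 48.8566, 2.3522, 0],
+        [1, 48.8570, 2.3530, 60],
+        [1, 48.8575, 2.3540, 120],
+    ])
+    # One landmark row: [lid, uid, lat, lng, tim]
+    lmdks = np.array([[1, 1, 48.8570, 2.3530, 60]])
+    rls_data, bgts, skipped = adaptive(seq, lmdks, 1.0)
+    print('Skipped releases: %d' %(skipped))
+    print('Spent budget    : %.4f' %(sum(bgts)))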
+def skip(seq, lmdks, epsilon):
+    '''
+    Skip landmarks.
+
+    Parameters:
+        seq - The point sequence.
+        lmdks - The landmarks.
+        epsilon - The available privacy budget.
+    Returns:
+        rls_data - The perturbed data.
+        bgts - The privacy budget allocation.
+    '''
+    # Event-level budget allocation
+    bgts = np.array(len(seq)*[epsilon])
+    # Released
+    rls_data = [None]*len(seq)
+    for i, p in enumerate(seq):
+        # Get coordinates
+        loc = (p[1], p[2])
+        # Add noise to original data
+        new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
+        # Check if the current point is a landmark
+        if lmdk_lib.is_landmark(p, lmdks):
+            if i > 0:
+                # Approximate with the previous location
+                new_loc = (rls_data[i - 1][1], rls_data[i - 1][2])
+                bgts[i] = 0
+        rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
+    return rls_data, bgts
+
+
+def sample(seq, lmdks, epsilon):
+    '''
+    Publish randomly.
+
+    Parameters:
+        seq - The point sequence.
+        lmdks - The landmarks.
+        epsilon - The available privacy budget.
+    Returns:
+        rls_data - The perturbed data.
+        bgts - The privacy budget allocation.
+        skipped - The number of skipped releases.
+    '''
+    # Uniform budget allocation
+    bgts = uniform(seq, lmdks, epsilon)
+    # Released
+    rls_data = [None]*len(seq)
+    # The sampling rate
+    # (publish with 50% chance)
+    samp_rt = .5
+    # Track landmarks
+    lmdk_cur = 0
+    # Track skipped releases
+    skipped = 0
+    for i, p in enumerate(seq):
+        # Check if the current point is a landmark
+        if lmdk_lib.is_landmark(p, lmdks):
+            lmdk_cur += 1
+        # Get coordinates
+        loc = (p[1], p[2])
+        if i == 0 or lmdk_lib.should_sample(samp_rt):
+            # Add noise to original data
+            new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
+            rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
+        else:
+            skipped += 1
+            # Skip the current release and approximate with the previous one
+            rls_data[i] = rls_data[i - 1]
+            if lmdk_lib.is_landmark(p, lmdks):
+                # Allocate the current budget to the following releases uniformly
+                for j in range(i + 1, len(seq)):
+                    bgts[j] += bgts[i]/(len(lmdks) - lmdk_cur + 1)
+            # No budget was spent
+            bgts[i] = 0
+    return rls_data, bgts, skipped
+
+
+def uniform_r(seq, lmdks, epsilon):
+    '''
+    Publish with uniform budget allocation.
+
+    Parameters:
+        seq - The point sequence.
+        lmdks - The landmarks.
+        epsilon - The available privacy budget.
+    Returns:
+        rls_data - The perturbed data.
+        bgts - The privacy budget allocation.
+    '''
+    # Released
+    rls_data = [None]*len(seq)
+    # Budgets
+    bgts = uniform(seq, lmdks, epsilon)
+    for i, p in enumerate(seq):
+        # Get coordinates
+        loc = (p[1], p[2])
+        # Add noise to original data
+        new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
+        rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
+    return rls_data, bgts
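+
+
+# Illustrative usage (not part of the original patch): a hedged sketch that
+# perturbs the same made-up trajectory with skip() and uniform_r() and
+# compares the mean absolute error of the two releases via mae() below.
+def _demo_skip_vs_uniform():
+    seq = np.array([
+        [1, 48.8566, 2.3522, 0],
+        [1, 48.8570, 2.3530, 60],
+        [1, 48.8575, 2.3540, 120],
+    ])
+    lmdks = np.array([[1, 1, 48.8570, 2.3530, 60]])
+    o_s, _ = skip(seq, lmdks, 1.0)
+    o_u, _ = uniform_r(seq, lmdks, 1.0)
+    print('MAE skip   : %.2fm' %(mae(seq, o_s)))
+    print('MAE uniform: %.2fm' %(mae(seq, o_u)))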
+def utility_analysis(seq, lmdks, o, epsilon):
+    '''
+    Analyze the utility.
+
+    Parameters:
+        seq - The point sequence.
+        lmdks - The landmarks.
+        o - The perturbed data.
+        epsilon - The available privacy budget.
+    Returns:
+        Nothing.
+    '''
+    # Estimate the mean absolute error
+    mae = 0
+    # Compare with uniform
+    mae_u = 0
+    bgts_u = uniform(seq, lmdks, epsilon)
+    for i, p in enumerate(seq):
+        mae += distance((p[1], p[2]), (o[i][1], o[i][2])).km*1000/len(seq)
+        new_loc = lmdk_lib.add_polar_noise((p[1], p[2]), bgts_u[i])
+        mae_u += distance((p[1], p[2]), (new_loc[0], new_loc[1])).km*1000/len(seq)
+    print(
+        '\n########## Analysis ##########\n'
+        'MAE uniform : %.2fm\n'
+        'MAE current : %.2fm\n'
+        'Difference  : %.2f%%\n'
+        '##############################\n'
+        %(mae_u, mae, 100*(mae - mae_u)/mae_u)
+    )
+
+
+def mae(seq, o):
+    '''
+    Estimate the mean absolute error, in meters, of the perturbed data o.
+    '''
+    total = 0
+    for i, p in enumerate(seq):
+        total += distance((p[1], p[2]), (o[i][1], o[i][2])).km*1000/len(seq)
+    return total
diff --git a/code/lib/lmdk_lib.py b/code/lib/lmdk_lib.py
new file mode 100644
index 0000000..bc22c25
--- /dev/null
+++ b/code/lib/lmdk_lib.py
@@ -0,0 +1,967 @@
+#!/usr/bin/env python3
+
+from datetime import datetime
+from geopy.distance import distance
+import heapq
+import itertools
+from scipy.special import lambertw
+import math
+import numpy as np
+import os
+import random
+import scipy.stats as stats
+import time
+from matplotlib import pyplot as plt
+import zipfile
+
+
+# Plot globals
+dpi = 300
+font_size = 24.0
+line_width = 2.0
+marker_size = 14.0
+tick_length = 8.0
+
+
+def add_polar_noise(loc, epsilon):
+    '''
+    Add noise from the planar Laplace distribution.
+    [https://github.com/chatziko/location-guard/blob/master/src/js/common/laplace.js]
+
+    Parameters:
+        loc - The original location.
+        epsilon - The privacy budget.
+    Returns:
+        The perturbed location (lat, lng).
+    '''
+    # The bearing in radians:
+    # a random number in [0, 2*pi).
+    theta = np.random.uniform(low=0, high=np.pi*2)
+    # The distance in meters.
+    # Use the inverse cumulative polar Laplace distribution function.
+    r = -np.real((lambertw((np.random.uniform(low=0, high=1) - 1)/np.e, k=-1) + 1)/epsilon)
+    new_loc = None
+    while not is_valid(new_loc):
+        new_loc = distance(kilometers=r/1000).destination(point=loc, bearing=np.degrees(theta))
+    return new_loc
+
+
+def draw_line(line, x, label, marker):
+    '''
+    Plot the values in x as a line with the given label and marker.
+    '''
+    axis_x = list(range(len(x)))
+    # plt.xticks(axis_x, x)
+    plt.plot(axis_x, x, label=label, marker=marker)
+
+
+def eval_seq(dists):
+    '''
+    Calculate the standard deviation of a list of distances.
+
+    Parameters:
+        dists - A list of distances.
+    Returns:
+        The standard deviation of the distances.
+    '''
+    return np.std(dists)
+
+
+def get_abs_dists(seq, comb, lmdks):
+    '''
+    Get the distances of the points in a combination from the start and end of the sequence.
+
+    Parameters:
+        seq - The point sequence.
+        comb - A possible point combination for a specified size.
+        lmdks - The landmarks.
+    Returns:
+        dists - The distances of the points in a combination.
+    '''
+    cur = np.append(comb, lmdks)
+    cur.sort()
+    start = seq[0]
+    end = seq[len(seq) - 1]
+    dists = np.zeros([len(cur)*2])
+    # Check if there are any points
+    if cur.size:
+        for i in range(0, len(cur)):
+            # From the start
+            dists[i*2] = abs(cur[i] - start)
+            # From the end
+            dists[i*2 + 1] = abs(cur[i] - end)
+    return dists
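+
+
+# Illustrative usage (not part of the original patch): a minimal sketch of
+# add_polar_noise() above, showing how the expected displacement shrinks as
+# the privacy budget grows; the coordinates and budgets are made up.
+def _demo_polar_noise():
+    loc = (48.8566, 2.3522)
+    for epsilon in (.1, 1., 10.):
+        new_loc = add_polar_noise(loc, epsilon)
+        err = distance(loc, (new_loc[0], new_loc[1])).km*1000
+        print('epsilon=%5.1f -> displaced %.2fm' %(epsilon, err))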
+def get_combs(seq, size):
+    '''
+    Get all the possible landmark point combinations for a specified size.
+
+    Parameters:
+        seq - The point sequence.
+        size - The desired number of landmarks.
+    Returns:
+        All the possible landmark combinations for a specified size.
+    '''
+    return itertools.combinations(seq, size)
+
+
+def get_combs_h(h, hist, max):
+    '''
+    Get all the bin-count combinations that dominate hist,
+    with each count at most h and a total count of at most max.
+    '''
+    combs = []
+    for comb in itertools.product(range(h + 1), repeat=len(hist)):
+        check = True
+        for i, c in enumerate(comb):
+            if sum(comb) > max or c < hist[i] or c > h:
+                check = False
+                break
+        if check:
+            combs.append(comb)
+    return combs
+
+
+def get_emd(h1, h2):
+    '''
+    The earth mover's distance (EMD), or Wasserstein distance, of two histograms.
+    [https://stats.stackexchange.com/a/151362]
+    Pele et al., The quadratic-chi histogram distance family
+
+    Parameters:
+        h1 - A histogram.
+        h2 - Another histogram.
+    Return:
+        The EMD of the histograms.
+    '''
+    return stats.wasserstein_distance(h1, h2)
+    # return stats.wasserstein_distance(get_norm_hist(h1), get_norm_hist(h2))
+
+
+def get_hist(seq, lmdks):
+    '''
+    Create a histogram from a set of landmarks.
+
+    Parameters:
+        seq - The point sequence.
+        lmdks - The landmarks.
+    Return:
+        hist - The histogram of landmarks.
+        h - The bin size.
+    '''
+    # # Create a landmark sequence relative to the entire sequence
+    # lmdks_rel = np.array(lmdks)
+    # # Add the start of the sequence if not in landmarks
+    # if not np.any(lmdks_rel[:] == start):
+    #     lmdks_rel = np.insert(lmdks_rel, 0, start)
+    # # Add the end of the sequence if not in landmarks
+    # if not np.any(lmdks_rel[:] == end):
+    #     lmdks_rel = np.append(lmdks_rel, end)
+
+    # Dealing with zeros.
+    if len(seq) == 0:
+        return np.array([]), 1
+    if len(lmdks) == 0:
+        return np.zeros(math.ceil(max(seq))), 1
+
+    # The interquartile range (IQR) is a measure of statistical dispersion,
+    # equal to the difference between the 75th and 25th percentiles.
+    # https://en.wikipedia.org/wiki/Interquartile_range
+    iqr = stats.iqr(lmdks)
+
+    # Use the Freedman–Diaconis rule to select the width of the bins to be used in the histogram.
+    # Robust (resilient to outliers) estimator that takes into account data variability and data size.
+    # https://en.wikipedia.org/wiki/Freedman%E2%80%93Diaconis_rule
+    h = 2*iqr*len(lmdks)**(-1/3)
+    if h < 1:
+        h = 1
+    # On the number of bins and width:
+    # https://en.wikipedia.org/wiki/Histogram#Number_of_bins_and_width
+
+    # Normalize the interval
+    h = math.ceil(max(seq)/math.ceil(max(seq)/math.ceil(h)))
+
+    # Create an empty histogram with intervals of size h
+    hist = np.zeros(math.ceil(max(seq)/h))
+
+    # Bin the (1-based) landmarks
+    for lmdk in lmdks:
+        hist[math.ceil(lmdk/h) - 1] += 1
+
+    return hist, h
+
+
+def get_lmdks(seq, size, dist):
+    '''
+    Get a landmark set for a distribution type.
+
+    Parameters:
+        seq - The point sequence.
+        size - The landmarks' size.
+        dist - The distribution type code.
+            -1 - Left-skewed
+             0 - Symmetric
+            +1 - Right-skewed
+            +2 - Bimodal
+            +3 - Uniform
+    Return:
+        The landmarks.
+    '''
+    scale = len(seq)/10
+    if dist == -1 or dist == 0 or dist == 1:
+        return MixtureModel([truncnorm(min(seq), max(seq), get_loc(seq, dist), scale)]).sample(min(seq), max(seq), [], size)
+    elif dist == 2:
+        return MixtureModel([
+            truncnorm(min(seq), max(seq), get_loc(seq, -1), scale),
+            truncnorm(min(seq), max(seq), get_loc(seq, +1), scale)
+        ]).sample(min(seq), max(seq), [], size)
+    else:
+        return np.sort(np.random.choice(np.arange(min(seq), max(seq) + 1, 1), size=size, replace=False))
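+
+
+# Illustrative usage (not part of the original patch): a small sketch of
+# get_hist() above, which bins landmarks using a Freedman–Diaconis bin width;
+# the sequence and landmarks are made up.
+def _demo_get_hist():
+    seq = get_seq(1, 20)
+    lmdks = np.array([2, 3, 5, 11, 12, 13])
+    hist, h = get_hist(seq, lmdks)
+    print('Bin size : %d' %(h))
+    print('Histogram: %s' %(hist))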
+def get_lmdks_rand(seq, size, skew):
+    '''
+    Get a landmark set with a given skewness.
+
+    Parameters:
+        seq - The point sequence.
+        size - The landmarks' size.
+        skew - The desired skewness.
+    Return:
+        The landmarks.
+    '''
+    # Get all possible landmark combinations
+    lmdk_combs = get_combs(seq, size)
+    # Find a set of landmarks for the given requirements
+    lmdks = ()
+    for comb in lmdk_combs:
+        skew_cur = get_skew(comb)
+        # Check if the skewness is close to the requirement or has the same sign
+        if math.isclose(skew_cur, skew, rel_tol=.1) or ((skew_cur < 0) == (skew < 0)):
+            lmdks = comb
+            break
+    return lmdks
+
+
+def get_loc(seq, skew):
+    '''
+    Get the location of the sequence with a given skewness.
+
+    Parameters:
+        seq - The point sequence.
+        skew - The desired skewness.
+    Return:
+        The location.
+    '''
+    loc_min = min(seq)
+    loc_max = max(seq)
+    if skew < 0:
+        loc_min = (max(seq) - min(seq))/2
+    elif skew > 0:
+        loc_max = (max(seq) - min(seq))/2
+    return int(loc_min + (loc_max - loc_min)/2)
+
+
+def get_mean(seq, hist, h):
+    '''
+    Get the mean of the histogram.
+
+    Parameters:
+        seq - The point sequence.
+        hist - The histogram of landmarks.
+        h - The bin size.
+    Return:
+        The mean of the histogram.
+    '''
+    total = 0
+    for idx, count in enumerate(hist):
+        # Find bin limits
+        start = min(seq) + h*idx
+        end = min(seq) + h*(idx + 1)
+        if end > max(seq):
+            end = max(seq)
+        total += (start + (end - start)/2)*count
+    return total/np.sum(hist)
+
+
+def get_norm(h1, h2):
+    '''
+    The Euclidean distance of two histograms.
+    [https://stackoverflow.com/a/1401828/13123075]
+
+    Parameters:
+        h1 - A histogram.
+        h2 - Another histogram.
+    Return:
+        The Euclidean distance of the histograms.
+    '''
+    return np.linalg.norm(h1 - h2)
+
+
+def get_norm_hist(hist):
+    '''
+    Normalize the histogram.
+
+    Parameters:
+        hist - The original histogram.
+    Return:
+        The normalized histogram.
+    '''
+    # Convert the counts to floats (astype returns a copy)
+    norm_hist = hist.astype(np.float32)
+    n = np.sum(norm_hist)
+    for i, a in enumerate(norm_hist):
+        if a:
+            norm_hist[i] = a/n
+    return norm_hist
+
+
+def get_opts(seq, lmdks):
+    '''
+    Find all the possible valid options.
+
+    Parameters:
+        seq - The point sequence.
+        lmdks - The landmarks.
+    Returns:
+        A list with all the possible options.
+
+    Requirements:
+        n = Regular points
+        r = The size of a combination
+        Time - O(C(n, r) + 2^C(n, r)), because O(n*C(n, r)) = O(n*n^min(r, n - r))
+        Space - O(r*C(n, r))
+    '''
+    reg = get_reg(seq, lmdks)
+    # Find all possible combinations for every k
+    combs = []
+    for k in range(len(lmdks) + 1, seq[len(seq) - 1] + 1):
+        combs.append(get_combs(reg, k - len(lmdks)))
+    # Find all possible options for all combinations
+    opts = itertools.product(*combs)
+    # Keep only valid options, i.e., larger sets must contain every element of every smaller set
+    rslts = []
+    for opt in opts:
+        for i, _ in enumerate(opt):
+            if i and not set(opt[i - 1]).issubset(opt[i]):
+                break
+        else:
+            rslts.append(opt)
+    return rslts
+
+
+def get_reg(seq, lmdks):
+    '''
+    Get the regular points, i.e., non-landmarks, in a sequence.
+
+    Parameters:
+        seq - The point sequence.
+        lmdks - The landmarks.
+    Returns:
+        The non-landmark points in a sequence.
+    '''
+    return np.array([i for i in seq if i not in lmdks])
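+
+
+# Illustrative usage (not part of the original patch): a hedged sketch
+# comparing two made-up histograms with the distance measures above,
+# after normalizing them with get_norm_hist().
+def _demo_hist_dists():
+    h1 = np.array([4, 1, 0, 2])
+    h2 = np.array([1, 1, 3, 2])
+    print('Euclidean: %.4f' %(get_norm(get_norm_hist(h1), get_norm_hist(h2))))
+    print('EMD      : %.4f' %(get_emd(get_norm_hist(h1), get_norm_hist(h2))))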
+def get_rel_dists(seq, comb, lmdks):
+    '''
+    Get the distances of the points in a combination from the nearest point of reference.
+    That is, the previous/next point, or the start/end of the sequence if closer.
+
+    Parameters:
+        seq - The point sequence.
+        comb - A possible point combination for a specified size.
+        lmdks - The landmarks.
+    Returns:
+        dists - The distances of the points in a combination.
+    '''
+    # TODO: Review distances. Maybe take into account both ways?
+    cur = np.append(comb, lmdks)
+    cur.sort()
+    start = seq[0]
+    end = seq[len(seq) - 1]
+    dists = np.zeros([len(cur) + 1])
+    # Check if there are any points
+    if cur.size:
+        # First
+        dists[0] = abs(cur[0] - start)
+        # In-between
+        for i in range(0, len(cur) - 1):
+            dists[i + 1] = abs(cur[i] - cur[i + 1])
+        # Last
+        dists[len(cur)] = abs(cur[len(cur) - 1] - end)
+    return dists
+
+
+def get_seq(start, end):
+    '''
+    Get a sequence of points.
+
+    Parameters:
+        start - The starting point of the sequence.
+        end - The ending point of the sequence.
+    Returns:
+        A point sequence.
+    '''
+    return np.arange(start, end + 1)
+
+
+def get_shannon_ent(a):
+    '''
+    Get the Shannon entropy of an array.
+    It can be used to calculate the uniformity of a histogram.
+    Uniform probability yields maximum uncertainty and therefore maximum entropy.
+    Entropy, then, can only decrease from the value associated with uniform probability.
+
+    Parameters:
+        a - An array of integers.
+    Return:
+        The Shannon entropy.
+    '''
+    # Keep only the non-zero elements of array a because log2(0) is -inf.
+    # The entropy of an event with zero probability is 0.
+    a = a[a != 0]
+    # Histograms are discrete probability distributions.
+    # We convert the counts to probabilities.
+    p_a = a/np.sum(a)
+    # Base 2 gives the unit of bits (or 'shannons').
+    return -np.sum(p_a*np.log2(p_a))
+
+
+def get_shannon_ent_norm(a):
+    '''
+    Get the normalized Shannon entropy of an array.
+    It ranges from 0 (high entropy - close to uniform) to 1 (low entropy).
+
+    Parameters:
+        a - An array of integers.
+    Return:
+        The normalized Shannon entropy.
+    '''
+    return 1 - get_shannon_ent(a)/np.log2(len(a))
+
+
+def get_skew(lmdks):
+    '''
+    Fisher-Pearson coefficient of skewness.
+        negative - left (more data items right)
+        zero - symmetric data
+        positive - right (more data items left)
+
+    Parameters:
+        lmdks - The landmarks.
+    Return:
+        Fisher-Pearson coefficient of skewness.
+    '''
+    # def get_skew(seq, hist, h):
+    #     return get_third(seq, hist, h)/get_std(seq, hist, h)**3
+    return stats.skew(lmdks)
+
+
+def get_std(seq, hist, h):
+    '''
+    Get the standard deviation.
+
+    Parameters:
+        seq - The point sequence.
+        hist - The histogram of landmarks.
+        h - The bin size.
+    Return:
+        The standard deviation.
+    '''
+    hist_mean = get_mean(seq, hist, h)
+    total = 0
+    for idx, count in enumerate(hist):
+        # Find bin limits
+        start = min(seq) + h*idx
+        end = min(seq) + h*(idx + 1)
+        if end > max(seq):
+            end = max(seq)
+        total += (((start + (end - start)/2) - hist_mean)**2)*count
+    # Take the square root of the variance
+    return math.sqrt(total/np.sum(hist))
+
+
+def get_third(seq, hist, h):
+    '''
+    Get the third central moment.
+
+    Parameters:
+        seq - The point sequence.
+        hist - The histogram of landmarks.
+        h - The bin size.
+    Return:
+        The third central moment.
+    '''
+    hist_mean = get_mean(seq, hist, h)
+    total = 0
+    for idx, count in enumerate(hist):
+        # Find bin limits
+        start = min(seq) + h*idx
+        end = min(seq) + h*(idx + 1)
+        if end > max(seq):
+            end = max(seq)
+        total += (((start + (end - start)/2) - hist_mean)**3)*count
+    return total/np.sum(hist)
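+
+
+# Illustrative usage (not part of the original patch): a minimal sketch of
+# the dispersion measures above on made-up data. A flat histogram maximizes
+# the Shannon entropy, so its normalized entropy is 0.
+def _demo_dispersion():
+    flat, peaked = np.array([3, 3, 3, 3]), np.array([9, 1, 1, 1])
+    print('Normalized entropy (flat)  : %.4f' %(get_shannon_ent_norm(flat)))
+    print('Normalized entropy (peaked): %.4f' %(get_shannon_ent_norm(peaked)))
+    # Positive skewness: most of the mass on the left, long tail to the right
+    print('Skewness: %.4f' %(get_skew(np.array([1, 2, 2, 3, 10]))))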
+def dist_type_to_str(d):
+    '''
+    Convert the distribution type code to string.
+
+    Parameters:
+        d - The distribution type code.
+            -1 - Left-skewed
+             0 - Symmetric
+            +1 - Right-skewed
+            +2 - Bimodal
+            +3 - Uniform
+    Return:
+        The distribution type.
+    '''
+    if d == -1:
+        return 'Left-skewed'
+    elif d == 0:
+        return 'Symmetric'
+    elif d == 1:
+        return 'Right-skewed'
+    elif d == 2:
+        return 'Bimodal'
+    elif d == 3:
+        return 'Uniform'
+    else:
+        return 'Undefined'
+
+
+class MixtureModel(stats.rv_continuous):
+    '''
+    A mixture model of continuous probability distributions.
+    [https://stackoverflow.com/a/47763145/13123075]
+    '''
+    def __init__(self, submodels, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.submodels = submodels
+
+    def _pdf(self, x):
+        pdf = self.submodels[0].pdf(x)
+        for submodel in self.submodels[1:]:
+            pdf += submodel.pdf(x)
+        pdf /= len(self.submodels)
+        return pdf
+
+    def rvs(self, size):
+        submodel_choices = np.random.randint(len(self.submodels), size=size)
+        submodel_samples = [submodel.rvs(size=size) for submodel in self.submodels]
+        rvs = np.choose(submodel_choices, submodel_samples)
+        return rvs
+
+    def sample(self, lower, upper, sample, size):
+        '''
+        Sample from the mixture model without replacement.
+        [https://stackoverflow.com/a/20548895/13123075]
+
+        Parameters:
+            self - The mixture model.
+            lower - The lower bound of the range.
+            upper - The upper bound of the range.
+            sample - Items to be excluded.
+            size - The sample size.
+        Returns:
+            A sample.
+        '''
+        w = self.pdf(range(int(lower), int(upper) + 1))
+        idx = []
+        for i in range(int(lower), int(upper) + 1):
+            if i not in sample:
+                idx.append(i)
+        # Weighted sampling without replacement: the size largest keys win
+        elt = [(math.log(random.random())/w[i - int(lower)], i) for i in idx]
+        return np.sort([x[1] for x in heapq.nlargest(size, elt)])
+
+
+def plot_init():
+    '''
+    Initialize the plot.
+    '''
+    # Reset
+    plt.close('all')
+    # Style
+    plt.style.use('classic')
+    # DPI
+    plt.figure(dpi=dpi)
+    # Grid
+    plt.setp(plt.gca().spines.values(), linewidth=line_width)
+    plt.grid(True, axis='y', linewidth=line_width)
+    # Font
+    plt.rc('font', family='sans-serif')
+    plt.rc('font', **{'sans-serif':['Liberation Sans']})
+    plt.rc('font', size=font_size)
+    # Ticks
+    plt.gca().tick_params(which='both', width=line_width)
+    plt.gca().tick_params(which='major', length=tick_length)
+    plt.gca().tick_params(which='minor', length=tick_length/2)
+    # Colors
+    plt.gca().set_prop_cycle('color', ['#212121', '#616161', '#9e9e9e', '#bdbdbd', '#e0e0e0', '#f5f5f5'])
+    # Layout
+    plt.tight_layout()
+
+
+def plot_legend():
+    '''
+    Initialize the plot legend.
+    '''
+    plt.legend(
+        loc='best',
+        fontsize=font_size,
+        numpoints=1,
+        borderpad=.2,      # default: 0.4
+        labelspacing=.25,  # default: 0.5
+        handlelength=2.0,  # default: 2.0
+        handletextpad=.4,  # default: 0.8
+        borderaxespad=.5,  # default: 0.5
+    )
+
+
+def save_plot(path):
+    '''
+    Save the plot to a file.
+
+    Parameters:
+        path - The desired location.
+    Returns:
+        Nothing.
+    '''
+    # Save plot
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    plt.savefig(path, bbox_inches='tight')
+    # Clean up
+    plt.clf()
+    plt.close()
+
+
+def print_lmdks(seq, lmdks):
+    '''
+    Print the landmarks.
+
+    Parameters:
+        seq - The point sequence.
+        lmdks - The landmarks.
+    Returns:
+        Nothing.
+    '''
+    print('\n######### Landmarks ##########')
+    print(lmdks)
+    print('[', end='', flush=True)
+    for p in seq:
+        if np.any(lmdks[:] == p):
+            print('\'', end='', flush=True)
+        else:
+            print('.', end='', flush=True)
+    print(']', end='', flush=True)
+    print('\n##############################\n')
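+
+
+# Illustrative usage (not part of the original patch): a hedged sketch of
+# MixtureModel.sample() above, drawing a without-replacement sample of
+# integers from a bimodal mixture; all parameter values are made up.
+def _demo_mixture_sample():
+    lower, upper = 1, 100
+    model = MixtureModel([
+        truncnorm(lower, upper, 25, 10),
+        truncnorm(lower, upper, 75, 10),
+    ])
+    print('Sample: %s' %(model.sample(lower, upper, [], 10)))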
+def plot_dist(seq, dist):
+    '''
+    Plot the probability distribution.
+
+    Parameters:
+        seq - The point sequence.
+        dist - The distribution type code.
+            -1 - Left-skewed
+             0 - Symmetric
+            +1 - Right-skewed
+            +2 - Bimodal
+            +3 - Uniform
+    Returns:
+        Nothing.
+    '''
+    scale = len(seq)/10
+    p = stats.uniform.pdf(seq)
+    if dist == -1 or dist == 0 or dist == 1:
+        p = MixtureModel([truncnorm(min(seq), max(seq), get_loc(seq, dist), scale)]).pdf(seq)
+    elif dist == 2:
+        p = MixtureModel([
+            truncnorm(min(seq), max(seq), get_loc(seq, -1), scale),
+            truncnorm(min(seq), max(seq), get_loc(seq, +1), scale)
+        ]).pdf(seq)
+    plt.plot(seq, p)
+    plt.show()
+
+
+def simplify_data(seq, lmdks):
+    '''
+    Get synthetic from real data.
+
+    Parameters:
+        seq - The trajectory.
+        lmdks - The landmarks.
+    Returns:
+        The simplified sequence and landmarks.
+    '''
+    seq_s = get_seq(1, len(seq))
+    lmdks_s = []
+    for i, p in enumerate(seq):
+        if is_landmark(p, lmdks):
+            lmdks_s.append(i + 1)
+    return seq_s, np.array(lmdks_s)
+
+
+def truncnorm(lower, upper, loc, scale):
+    '''
+    A truncated normal continuous random variable.
+
+    Parameters:
+        lower - The lower bound of the range.
+        upper - The upper bound of the range.
+        loc - The location of the distribution.
+        scale - The spread of the distribution.
+    Returns:
+        A truncated normal continuous random variable.
+    '''
+    return stats.truncnorm((lower - loc)/scale, (upper - loc)/scale, loc, scale)
+
+
+def is_valid(coord):
+    '''
+    Check if coordinates are valid.
+
+    Parameters:
+        coord - The coordinates to be checked.
+    Returns:
+        True - Valid coordinates.
+        False - Invalid coordinates.
+    '''
+    if coord is None or (float(coord[0]) > 90 or float(coord[0]) < -90      # lat
+            or float(coord[1]) > 180 or float(coord[1]) < -180):            # lng
+        return False
+    return True
+
+
+def save_data(args, data, name):
+    '''
+    Save to file and compress.
+
+    Parameters:
+        args - The execution arguments.
+        data - The data.
+        name - The name.
+    Returns:
+        Nothing.
+    '''
+    try:
+        path = os.path.dirname(os.path.abspath(args.res)) + '/' + name
+        if 'lmdks' in name:
+            path += '(dis=' + str(args.dist) + ', per=' + str(args.per) + ')'
+        print('Saving to %s... ' %(path), end='', flush=True)
+        with open(path, 'wb') as file:
+            _, idx = np.unique(data, axis=0, return_index=True)
+            np.save(file, data[np.sort(idx)], allow_pickle=False)
+        print('[OK]')
+        # Compress file
+        print('Compressing in %s... ' %(args.res), end='', flush=True)
+        with zipfile.ZipFile(args.res, 'a') as zip_file:
+            zip_file.write(path, os.path.basename(path))
+        print('[OK]')
+        # Delete file
+        print('Deleting %s... ' %(path), end='', flush=True)
+        os.remove(path)
+        print('[OK]')
+    except Exception as e:
+        print('[Error: %s]' %(e))
+
+
+def load_data(args, name):
+    '''
+    Load data from file.
+
+    Parameters:
+        args - The execution arguments.
+        name - The name.
+    Returns:
+        data - The data set.
+            0: uid, 1: lat, 2: lng, 3: tim
+    '''
+    data = np.array([])
+    try:
+        path = os.path.dirname(os.path.abspath(args.res)) + '/' + name
+        if 'lmdks' in name:
+            path += '(dis=' + str(args.dist) + ', per=' + str(args.per) + ')'
+        print('Loading %s... ' %(path), end='', flush=True)
+        with zipfile.ZipFile(args.res, 'r') as zip_file:
+            data = np.load(zip_file.open(os.path.basename(path)))
+        print('[OK]')
+    except Exception as e:
+        print('[Error: %s]' %(e))
+    return data
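+
+
+# Illustrative usage (not part of the original patch): a quick sketch of
+# is_valid() above on made-up coordinate pairs.
+def _demo_is_valid():
+    for coord in (None, (48.8566, 2.3522), (91.0, 0.0), (0.0, -181.0)):
+        print('%s -> %s' %(str(coord), is_valid(coord)))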
+def find_lmdks(usrs_data, args):
+    '''
+    Find users' landmarks.
+
+    Parameters:
+        usrs_data - The users' data.
+            0: uid, 1: lat, 2: lng, 3: tim
+        args - The execution arguments.
+    Returns:
+        usrs_lmdks - The users' landmarks.
+            0: lid, 1: uid, 2: lat, 3: lng, 4: tim
+    '''
+    usrs_lmdks = np.empty((0, 5), np.float32)
+    traj_cur = 0
+    lmdk_id = 0
+    usrs = np.unique(usrs_data[:, 0])
+    for usr_i, usr in enumerate(usrs):
+        # Initialize the user's landmarks list
+        lmdks = []
+        traj = usrs_data[usrs_data[:, 0] == usr, :]
+        traj_cur += len(traj)
+        print(
+            '[%d%%] Points: %d/%d | Users: %d/%d... '
+            %((traj_cur/usrs_data.shape[0])*100, traj_cur, usrs_data.shape[0], usr, usrs[len(usrs) - 1]),
+            end='', flush=True
+        )
+        # Check the actual points
+        i = 0
+        while i < len(traj) - 1:
+            lmdk_cur = []
+            if is_valid((traj[i][1], traj[i][2])):
+                for j in range(i + 1, len(traj)):
+                    if is_valid((traj[j][1], traj[j][2])):
+                        # Add the beginning only the first time
+                        if j == i + 1:
+                            lmdk_cur.append([lmdk_id, usr, traj[i][1], traj[i][2], traj[i][3]])
+                        # Add the new point
+                        lmdk_cur.append([lmdk_id, usr, traj[j][1], traj[j][2], traj[j][3]])
+                        # Distance in meters
+                        dist = distance((traj[i][1], traj[i][2]), (traj[j][1], traj[j][2])).km*1000
+                        # Distance exceeded or reached end of iteration
+                        if dist > args.dist or j == len(traj) - 1:
+                            # Period in minutes
+                            per = abs(datetime.fromtimestamp(int(traj[i][3])) - datetime.fromtimestamp(int(traj[j][3]))).total_seconds()/60
+                            # Check if enough time passed
+                            if per > args.per:
+                                # Landmark ids start from 1
+                                lmdk_id += 1
+                                # Assign the id to the current landmark
+                                for l in lmdk_cur:
+                                    l[0] = lmdk_id
+                                # Append the current landmark
+                                lmdks += lmdk_cur
+                                # Continue checking from the current point
+                                i = j
+                            break
+            # No landmark was found; continue from the next point
+            if i == 0 or not is_valid((traj[i][1], traj[i][2])) or 'j' not in vars() or i != j:
+                i += 1
+        print('[OK]')
+        if lmdks:
+            usrs_lmdks = np.append(usrs_lmdks, np.asarray(lmdks, dtype=np.float32), axis=0)
+    return usrs_lmdks
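+
+
+# Illustrative usage (not part of the original patch): a hedged sketch of
+# find_lmdks() on a made-up two-point stay. Only the .dist (meters) and
+# .per (minutes) attributes of args are used here, so a SimpleNamespace
+# stands in for the real execution arguments.
+def _demo_find_lmdks():
+    from types import SimpleNamespace
+    usrs_data = np.array([
+        [1, 48.8566, 2.3522, 0],
+        [1, 48.8566, 2.3523, 3600],
+        [1, 48.9000, 2.4000, 3660],
+    ])
+    args = SimpleNamespace(dist=100, per=30)
+    print(find_lmdks(usrs_data, args))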
+def lmdks_stats(args, usrs_lmdks):
+    '''
+    Generate landmarks' stats.
+
+    Parameters:
+        args - The execution arguments.
+        usrs_lmdks - The users' landmarks.
+            0: lid, 1: uid, 2: lat, 3: lng, 4: tim
+    Returns:
+        Nothing.
+    '''
+    lmdks_stats = {
+        'total': 0,
+        'len': 0,
+        'count': {},
+    }
+    # Check every user
+    usrs = np.unique(usrs_lmdks[:, 1])
+    for usr in usrs:
+        print(
+            '[%d%%] Calculating landmark stats for user %d/%d... '
+            %(usr*100/len(usrs), usr, np.max(usrs)),
+            end='', flush=True
+        )
+        # Check each user's landmarks
+        for lmdk in np.unique(usrs_lmdks[usrs_lmdks[:, 1] == usr, 0]):
+            lmdk_traj = usrs_lmdks[usrs_lmdks[:, 0] == lmdk, :]
+            lmdks_stats['total'] += 1
+            lmdks_stats['len'] += len(lmdk_traj)
+            if lmdks_stats['count'].get(len(lmdk_traj)) is None:
+                lmdks_stats['count'][len(lmdk_traj)] = 1
+            else:
+                lmdks_stats['count'][len(lmdk_traj)] += 1
+        print('[OK]')
+    hist_min = min(lmdks_stats['count'], key=int, default=0)
+    hist_max = max(lmdks_stats['count'], key=int, default=0)
+    # Make histogram
+    x = []
+    for i in range(hist_min, hist_max + 1):
+        if lmdks_stats['count'].get(i) is not None:
+            x.extend([i]*lmdks_stats['count'].get(i))
+    # Show stats
+    print(
+        '\n############ Stats ###########\n'
+        'Landmarks : %d\n'
+        '  Length\n'
+        '    Total   : %d (%.2f%%)\n'
+        '    Minimum : %d\n'
+        '    Maximum : %d\n'
+        '##############################\n'
+        %(lmdks_stats['total'], lmdks_stats['len'], (lmdks_stats['len']/args.time)*100, hist_min, hist_max)
+    )
+    # # Initialize plot
+    # plot_init()
+    # # Set x axis
+    # plt.xlabel('Landmarks sequence length')
+    # plt.xticks(rotation='vertical')
+    # # Set y axis
+    # plt.ylabel('Number of sequences')
+    # plt.yscale('log')
+    # # Create histogram
+    # plt.hist(x, bins=(hist_max - hist_min))
+    # # Show plot
+    # plt.show()
+    # # Save plot
+    # save_plot(os.path.dirname(args.arc) + '/' + 'results' + '(dis=' + str(args.dist) + ', per=' + str(args.per) + ').pdf')
+
+
+def should_sample(samp_rt):
+    '''
+    Randomly decide to release with noise.
+
+    Parameters:
+        samp_rt - The sampling rate (0, 1].
+    Returns:
+        True/False
+    '''
+    return random.choices(population=[True, False], weights=[samp_rt, 1 - samp_rt], k=1)[0]
+
+
+def is_landmark(p, lmdks):
+    '''
+    Check if a point is a landmark.
+
+    Parameters:
+        p - The point to check.
+        lmdks - A 2d array of landmarks.
+    Returns:
+        True/False
+    '''
+    if len(lmdks) and any(np.equal(lmdks[:, 1:5], p).all(1)):
+        return True
+    return False
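+
+
+# Illustrative usage (not part of the original patch): a minimal sketch of
+# should_sample() above; with a 0.25 sampling rate roughly a quarter of the
+# decisions should come back True.
+def _demo_should_sample():
+    decisions = [should_sample(.25) for _ in range(10000)]
+    print('Sampled %.1f%% of releases' %(100*sum(decisions)/len(decisions)))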