#!/usr/bin/env python3 import ast from datetime import datetime from geopy.distance import distance import heapq import itertools from scipy.special import lambertw import math import numpy as np import os import random import scipy.stats as stats import time from matplotlib import pyplot as plt import zipfile # Plot globals dpi = 300 font_size = 24.0 line_width = 2.0 marker_size = 14.0 tick_length = 8.0 def add_polar_noise(loc, epsilon): ''' Add noise from planar Laplace. [https://github.com/chatziko/location-guard/blob/master/src/js/common/laplace.js] Parameters: loc - The original location. epsilon - The privacy budget. Returns: The perturbed location (lat, lng). ''' # The bearing in radians. # Random number in [0, 2*pi), theta = np.random.uniform(low = 0, high = np.pi*2) # The distance in meters. # Use the inverse cumulative polar laplacian distribution function. r = -np.real((lambertw((np.random.uniform(low = 0, high = 1) - 1)/np.e, k = -1) + 1)/epsilon) new_loc = None while not is_valid(new_loc): new_loc = distance(kilometers = r/1000).destination(point = loc, bearing = np.degrees(theta)) return new_loc def randomized_response(r, epsilon): ''' Return the original response with probability p, or flip the response with probability 1 - p. [https://www.usenix.org/system/files/conference/usenixsecurity17/sec17-wang-tianhao.pdf] Parameters: r - The original response (True/False). epsilon - The privacy budget. Returns: The randomized response. ''' # Return the original value if random.uniform(0, 1) <= math.exp(epsilon)/(math.exp(epsilon) + 1): return r # Flip the original value else: return not r def add_laplace_noise(value, sens, epsilon): ''' Add random noise to a data value. Parameteres: value - The value to add noise to. sens - The query function sensitivity. epsilon - The privacy budget. Returns: The value with the noise. ''' return np.random.laplace(value, sens/epsilon) def draw_line(line, x, label, marker): axis_x = list(range(len(x))) # plt.xticks(axis_x, x) plt.plot(axis_x, x, label=label, marker=marker) def eval_seq(dists): ''' Calculate the standard deviation of a list of distances. Parameters: dists - A list of distances. Returns: The standard deviation of the distances. ''' return np.std(dists) def get_abs_dists(seq, comb, lmdks): ''' Get the distances of the points in a combination from the start and end of the sequence. Parameters: seq - The point sequence. comb - A possible point combination for a specified size. lmdks - The landmarks. Returns: dists - The distances of the points in a combination. ''' cur = np.append(comb, lmdks) cur.sort() start = seq[0] end = seq[len(seq) - 1] dists = np.zeros([len(cur)*2]) # Check if there are any points if cur.size: for i in range(0, len(cur)): # From the start dists[i*2] = abs(cur[i] - start) # From the end dists[i*2 + 1] = abs(cur[i] - end) return dists def get_combs(seq, size): ''' Get all the possible landmark point combinations for a specified size. Parameters: seq - The point sequence. size - The desired number of landmarks. Returns: All the possible landmark combinations for a specified size. ''' return itertools.combinations(seq, size) def get_combs_h(h, hist, max): combs = [] for comb in itertools.product(range(h + 1), repeat=len(hist)): check = True for i, c in enumerate(comb): if sum(comb) > max or c < hist[i] or c > h: check = False break if check: combs.append(comb) return combs def get_emd(h1, h2): ''' The earth mover's distance (EMD), or Wasserstein distance, of two histograms. [https://stats.stackexchange.com/a/151362] Pele et al., The quadratic-chi histogram distance family Parameters: h1 - A histogram. h2 - Another histogram. Return: The EMD of the histograms. ''' return stats.wasserstein_distance(h1,h2) # return stats.wasserstein_distance(get_norm_hist(h1), get_norm_hist(h2)) def get_hist(seq, lmdks): ''' Create a histogram from a set of landmarks. Parameters: seq - The point sequence. lmdks - The landmarks. Return: hist - The histogram of landmarks. h - The bin size. ''' # # Create a landmark sequence relative to the entire sequence # lmdks_rel = np.array(lmdks) # # Add the start of the sequence if not in landmarks # if not np.any(lmdks_rel[:] == start): # lmdks_rel = np.insert(lmdks_rel, 0, start) # # Add the end of the sequence if not in landmarks # if not np.any(lmdks_rel[:] == end): # lmdks_rel = np.append(lmdks_rel, end) # Dealing with zeros. if len(seq) == 0: return np.zeros(math.ceil(max(seq))), 1 elif len(lmdks) == 0: return np.zeros(1), len(seq) # Interquartile range (IQR) is a measure of statistical dispersion, being equal to the difference between 75th and 25th percentiles, or between upper and lower quartiles. # https://en.wikipedia.org/wiki/Interquartile_range iqr = stats.iqr(lmdks) # Use the Freedman–Diaconis rule to select the width of the bins to be used in the histogram. # Robust (resilient to outliers) estimator that takes into account data variability and data size. # https://en.wikipedia.org/wiki/Freedman%E2%80%93Diaconis_rule h = 2*iqr*len(lmdks)**(-1/3) if h < 1: h = 1 # On the number of bins and width: # https://en.wikipedia.org/wiki/Histogram#Number_of_bins_and_width # Normalize the interval h = math.ceil(max(seq)/math.ceil(max(seq)/math.ceil(h))) # Create an empty histogram with intervals of size h hist = np.zeros(math.ceil(max(seq)/h), dtype = int) for lmdk in lmdks: hist[int(lmdk/h) - 1] += 1 return hist, h def get_lmdks(seq, size, dist): ''' Get a landmark set for a distribution type. Parameters: seq - The point sequence. size - The landmarks' size. dist - The distribution type code. -1 - Left-skewed 0 - Symmetric +1 - Right-skewed +2 - Bimodal +3 - Uniform Return: hist - The histogram of landmarks. h - The bin size. ''' scale = len(seq)/10 if dist == -1 or dist == 0 or dist == 1: return MixtureModel([truncnorm(min(seq), max(seq), get_loc(seq, dist), scale)]).sample(min(seq), max(seq), [], size) elif dist == 2: return MixtureModel([ truncnorm(min(seq), max(seq), get_loc(seq, -1), scale), truncnorm(min(seq), max(seq), get_loc(seq, +1), scale) ]).sample(min(seq), max(seq), [], size) else: return np.sort(np.random.choice(np.arange(min(seq), max(seq) + 1, 1), size=size, replace=False)) def get_lmdks_rand(seq, size, skew): ''' Get a landmark set with a given skewness. Parameters: seq - The point sequence. size - The landmarks' size. skew - The desired skewness. Return: hist - The histogram of landmarks. h - The bin size. ''' # Get all possible landmark combinations lmdk_combs = get_combs(seq, size) # Find a set of landmarks for the given requirements lmdks = () for comb in lmdk_combs: skew_cur = get_skew(comb) # Check if skewness is close to the requirement or have the same sign if math.isclose(skew_cur, skew, rel_tol=.1) or ((skew_cur < 0) == (skew < 0)): lmdks = comb break return lmdks def get_loc(seq, skew): ''' Get the location of the sequence with a given skewness. Parameters: seq - The point sequence. skew - The desired skewness. Return: The location. ''' loc_min = min(seq) loc_max = max(seq) if skew < 0: loc_min = (max(seq) - min(seq))/2 elif skew > 0: loc_max = (max(seq) - min(seq))/2 return int(loc_min + (loc_max - loc_min)/2) def get_mean(seq, hist, h): ''' Get the mean of the histogram. Parameters: seq - The point sequence. hist - The histogram of landmarks. h - The bin size. Return: The mean of the histogram. ''' sum = 0 for idx, count in enumerate(hist): # Find bin limits start = min(seq) + h*idx end = min(seq) + h*(idx + 1) if(end > max(seq)): end = max(seq) sum += (start + (end - start)/2)*count return sum/np.sum(hist) def get_norm(h1, h2): ''' The Euclidean distance of two histograms. [https://stackoverflow.com/a/1401828/13123075] Parameters: h1 - A histogram. h2 - Another histogram. Return: The Euclidean distance of the histograms. ''' return np.linalg.norm(h1 - h2) def get_norm_hist(hist): ''' Normalize the histogram. Parameters: hist - The original histogram. Return: The normalized histogram. ''' # In-place type conversion norm_hist = hist.astype(np.float32) n = np.sum(norm_hist) for i, a in enumerate(norm_hist): if a: norm_hist[i] = a/n return norm_hist def get_opts(seq, lmdks): ''' Finds all the possible valid options. Parameters: seq - The point sequence. lmdks - The landmarks. Returns: A list with all the possible options. Requirements: n = Regular points r = The size of a combination Time - O(C(n, r) + 2^C(n, r)), because O(n*C(n, r)) = O(n*n^min(r, n - r)) Space - O(r*C(n, r)) ''' reg = get_reg(seq, lmdks) # Find all possible combinations for every k combs = [] for k in range(len(lmdks) + 1, seq[len(seq) - 1] + 1): combs.append(get_combs(reg, k - len(lmdks))) # Find all possible options for all combinations opts = itertools.product(*combs) # Keep only valid options, i.e., larger sets must contain every element of every smaller set rslts = [] for opt in opts: for i, _ in enumerate(opt): if i and not set(opt[i - 1]).issubset(opt[i]): break if i == len(opt) - 1: rslts.append(opt) return rslts def get_reg(seq, lmdks): ''' Get the regular points, i.e., non-landmarks, in a sequence. Parameters: seq - The point sequence. lmdks - The landmarks. Returns: The non-landmark points in a sequence. ''' return np.array([i for i in seq if i not in lmdks]) def get_rel_dists(seq, comb, lmdks): ''' Get the distances of the points in a combination from the nearest point of reference. That is, the previous/next point or the start/end if closer. Parameters: seq - The point sequence. comb - A possible point combination for a specified size. lmdks - The landmarks. Returns: dists - The distances of the points in a combination. ''' # TODO: Review distances. Maybe take into account both ways? cur = np.append(comb, lmdks) cur.sort() start = seq[0] end = seq[len(seq) - 1] dists = np.zeros([len(cur) + 1]) # Check if there are any points if cur.size: # First dists[0] = abs(cur[0] - start) # In-between for i in range(0, len(cur) - 1): dists[i + 1] = abs(cur[i] - cur[i + 1]) # Last dists[len(cur)] = abs(cur[len(cur) - 1] - end) return dists def get_seq(start, end): ''' Get a sequence of points. Parameters: start - The starting point of the sequence. end - The ending point of the sequence. Returns: A point sequence. ''' return np.arange(start, end + 1) def get_shannon_ent(a): ''' Get the Shannon entropy of an array. It can be used to calculate the uniformity of a histogram. Uniform probability yields maximum uncertainty and therefore maximum entropy. Entropy, then, can only decrease from the value associated with uniform probability. Parameters: a - An array of integers Return: The Shannon entropy. ''' # Keep only the non-zero elements of array a because log2(0) is -inf. # The entropy of an event with zero probability is 0. a = a[a != 0] # Histograms are discrete probability distributions. # We convert the counts to probabilities. p_a = a/np.sum(a) # Base 2 gives the unit of bits (or 'shannons'). return -np.sum(p_a*np.log2(p_a)) def get_shannon_ent_norm(a): ''' Get the normalized Shannon entropy of an array. It ranges from 0 (high entropy - close to uniform) to 1 (low entropy). It ranges from 0 (high entropy - close to uniform) to 1 (low entropy). It ranges from 0 (high entropy - close to uniform) to 1 (low entropy). Parameters: a - An array of integers Return: The normalized Shannon entropy. ''' return 1 - get_shannon_ent(a)/np.log2(len(a)) def get_skew(lmdks): ''' Fisher-Pearson coefficient of skewness. negative - left (more data items right) zero - symmetric data positive - right (more data items left) positive - right (more data items left) positive - right (more data items left) Parameters: seq - The point sequence. hist - The histogram of landmarks. h - The bin size. Return: Fisher-Pearson coefficient of skewness. ''' # def get_skew(seq, hist, h): # return get_third(seq, hist, h)/get_std(seq, hist, h)**3 return stats.skew(lmdks) def get_std(seq, hist, h): ''' Get the standard deviation. Parameters: seq - The point sequence. hist - The histogram of landmarks. h - The bin size. Return: The standard deviation. ''' hist_mean = get_mean(seq, hist, h) sum = 0 for idx, count in enumerate(hist): # Find bin limits start = min(seq) + h*idx end = min(seq) + h*(idx + 1) if(end > max(seq)): end = max(seq) sum += (((start + (end - start)/2) - hist_mean)**2)*count return sum/np.sum(hist) def get_third(seq, hist, h): ''' Get the third central moment. Parameters: seq - The point sequence. hist - The histogram of landmarks. h - The bin size. Return: The third central moment. ''' hist_mean = get_mean(seq, hist, h) sum = 0 for idx, count in enumerate(hist): # Find bin limits start = min(seq) + h*idx end = min(seq) + h*(idx + 1) if(end > max(seq)): end = max(seq) sum += (((start + (end - start)/2) - hist_mean)**3)*count return sum/np.sum(hist) def dist_type_to_str(d): ''' Convert the distribution type code to string. Parameters: t - The distribution type code. -1 - Left-skewed 0 - Symmetric +1 - Right-skewed +2 - Bimodal +3 - Uniform Return: The distribution type. ''' if d == -1: return 'Left-skewed' elif d == 0: return 'Symmetric' elif d == 1: return 'Right-skewed' elif d == 2: return 'Bimodal' elif d == 3: return 'Uniform' else: return 'Undefined' class MixtureModel(stats.rv_continuous): ''' A mixture model of continuous probability distributions [https://stackoverflow.com/a/47763145/13123075] ''' def __init__(self, submodels, *args, **kwargs): super().__init__(*args, **kwargs) self.submodels = submodels def _pdf(self, x): pdf = self.submodels[0].pdf(x) for submodel in self.submodels[1:]: pdf += submodel.pdf(x) pdf /= len(self.submodels) return pdf def rvs(self, size): submodel_choices = np.random.randint(len(self.submodels), size=size) submodel_samples = [submodel.rvs(size=size) for submodel in self.submodels] rvs = np.choose(submodel_choices, submodel_samples) return rvs def sample(self, lower, upper, sample, size): ''' Sample from the mixture model without replacement. [https://stackoverflow.com/a/20548895/13123075] Parameters: self - The mixture model. lower - The lower bound of the range. upper - The upper bound of the range. sample - Items to be excluded. size - The sample size. Returns: A sample. ''' w = self.pdf(range(int(lower), int(upper) + 1)) idx = [] for i in range(int(lower), int(upper) + 1): if i not in sample: idx.append(i) elt = [(math.log(random.random()) / w[i - 1], i) for i in idx] return np.sort([x[1] for x in heapq.nlargest(size, elt)]) def plot_init(): ''' Initialize the plot. ''' # Reset plt.close('all') # Style plt.style.use('classic') # DPI plt.figure(dpi=dpi) # Font plt.rc('font', family='sans-serif') plt.rc('font', **{'sans-serif':['Liberation Sans']}) plt.rc('font', size=font_size) # Grid plt.setp(plt.figure().add_subplot(111).spines.values(), linewidth=line_width) plt.grid(True, axis='y', linewidth=line_width) # Ticks plt.gca().tick_params(which='both', width=line_width) plt.gca().tick_params(which='major', length=tick_length) plt.gca().tick_params(which='minor', length=tick_length/2) # Colors plt.subplot(111).set_prop_cycle('color', ['#212121', '#616161', '#9e9e9e', '#bdbdbd', '#e0e0e0', 'f5f5f5']) # Layout plt.tight_layout() def plot_legend(): ''' Initialize the plot legend. ''' plt.legend( loc='best', fontsize=font_size, numpoints=1, borderpad=.2, # default: 0.4 labelspacing=.25, # default: 0.5 handlelength=2.0, # default: 2.0 handletextpad=.4, # default: 0.8 borderaxespad=.5, # default: 0.5 ) def save_plot(path): ''' Save the plot to a file. Parameters: path - The desired location. Returns: Nothing. ''' # Save plot os.makedirs(os.path.dirname(path), exist_ok=True) plt.savefig(path, bbox_inches='tight') # Clean up plt.clf() plt.close() def print_lmdks(seq, lmdks): ''' Print the landmarks. Parameters: seq - The point sequence. lmdks - The landmarks. Returns: Nothing. ''' print('\n######### Landmarks ##########') print(lmdks) print('[', end='', flush=True) for p in seq: if np.any(lmdks[:] == p): print('\'', end='', flush=True) else: print('.', end='', flush=True) print(']', end='', flush=True) print('\n##############################\n') def plot_dist(seq, dist): ''' Plot the probability distribution. Parameters: seq - The point sequence. size - The landmarks' size. dist - The distribution type code. -1 - Left-skewed 0 - Symmetric +1 - Right-skewed +2 - Bimodal +3 - Uniform Returns: Nothing. ''' scale = len(seq)/10 p = stats.uniform.pdf(seq) if dist == -1 or dist == 0 or dist == 1: p = MixtureModel([truncnorm(min(seq), max(seq), get_loc(seq, dist), scale)]).pdf(seq) elif dist == 2: p = MixtureModel([ truncnorm(min(seq), max(seq), get_loc(seq, -1), scale), truncnorm(min(seq), max(seq), get_loc(seq, +1), scale) ]).pdf(seq) plt.plot(seq, p) plt.show() def simplify_data(seq, lmdks): ''' Get synthetic from real data. Parameters: seq - The trajectory. lmdks - The landmarks. Returns: The simplified sequence and landmarks. ''' seq_s = get_seq(1, len(seq)) lmdks_s = [] for i, p in enumerate(seq): if is_landmark(p, lmdks): lmdks_s.append(i + 1) return seq_s, np.array(lmdks_s) def truncnorm(lower, upper, loc, scale): ''' A truncated normal continuous random variable. Parameters: lower - The lower bound of the range. upper - The upper bound of the range. loc - The location of the distribution. scale - The spread of the distribution. Returns: A truncated normal continuous random variable. ''' return stats.truncnorm((lower - loc)/scale, (upper - loc)/scale, loc, scale) def is_valid(coord): ''' Check if coordinates are valid. Parameters: coord - The coordinates to be checked. Returns: True - Valid coordinates. False - Invalid coordinates. ''' if coord is None or (float(coord[0]) > 90 or float(coord[0]) < -90 # lat or float(coord[1]) > 180 or float(coord[1]) < -180): #lng return False return True def save_data(args, data, name): ''' Save to file and compress. Parameters: args - The execution arguments. data - The data. name - The name. Returns: Nothing. ''' try: path = os.path.dirname(os.path.abspath(args.res)) + '/' + name if 'lmdks' in name: path += '(dis=' + str(args.dist) + ', per=' + str(args.per) + ')' print('Saving to %s... ' %(path), end='', flush=True) with open(path, 'wb') as file: _, idx = np.unique(data, axis=0, return_index=True) np.save(file, data[np.sort(idx)], allow_pickle=False) print('[OK]') # Compress file print('Compressing in %s... ' %(args.res), end='', flush=True) with zipfile.ZipFile(args.res, 'a') as zip_file: zip_file.write(path, os.path.basename(path)) print('[OK]') # Delete file print('Deleting %s... ' %(path), end='', flush=True) os.remove(path) print('[OK]') except Exception as e: print('[Error: %s]' %(e)) def load_data(args, name): ''' Load data from file. Parameters: args - The execution arguments. data - The data. name - The name. Returns: data - The data set. 0: uid, 1: lat, 2: lng, 3: tim ''' data = np.array([]) try: path = os.path.dirname(os.path.abspath(args.res)) + '/' + name if 'lmdks' in name: path += '(dis=' + str(args.dist) + ', per=' + str(args.per) + ')' print('Loading %s... ' %(path), end='', flush=True) with zipfile.ZipFile(args.res, 'r') as zip_file: data = np.load(zip_file.open(os.path.basename(path))) print('[OK]') except Exception as e: print('[Error: %s]' %(e)) return data def find_lmdks(usrs_data, args): ''' Find users' landmarks. Parameters: args - The execution arguments. usrs_data - The users' data. 0: uid, 1: lat, 2: lng, 3: tim Returns: usrs_lmdks - The users' landmarks. 0: uid, 1: lat, 2: lng, 3: tim ''' usrs_lmdks = np.empty((0,4), np.float32) traj_cur = 0 usrs = np.unique(usrs_data[:,0]) for usr_i, usr in enumerate(usrs): # Initialize user's landmarks list lmdks = [] traj = usrs_data[usrs_data[:,0]==usr, :] traj_cur += len(traj) print( '[%d%%] Points: %d/%d | Users: %d/%d... ' %((traj_cur/usrs_data.shape[0])*100, traj_cur, usrs_data.shape[0], usr, usrs[len(usrs) - 1]), end='', flush=True ) # Check the actual points i = 0 while i < len(traj) - 1: lmdk_cur = [] if is_valid((traj[i][1], traj[i][2])): for j in range(i + 1, len(traj)): if is_valid((traj[j][1], traj[j][2])): # Add the beginning only the first time if j == i + 1: lmdk_cur.append([usr, traj[i][1], traj[i][2], traj[i][3]]) # Add the new point lmdk_cur.append([usr, traj[j][1], traj[j][2], traj[j][3]]) # Distance in meters dist = distance((traj[i][1], traj[i][2]), (traj[j][1], traj[j][2])).km*1000 # Distance exceeded or reached end of iteration if dist > args.dist or j == len(traj) - 1: # Period in minutes per = abs(datetime.fromtimestamp(int(traj[i][3])) - datetime.fromtimestamp(int(traj[j][3]))).total_seconds()/60 # Check if enough time passed if per > args.per: # Append current landmark lmdks += lmdk_cur # Continue checking from the current point i = j break # No landmark was found, continue from next point if i == 0 or not is_valid((traj[i][1], traj[i][2])) or 'j' not in vars() or i != j: i += 1 print('[OK]') if lmdks: usrs_lmdks = np.append(usrs_lmdks, np.asarray(lmdks, dtype=np.float32), axis=0) return usrs_lmdks def find_lmdks_seq(seq, lmdks): lmdks_seq = [] for i, p in enumerate(seq): if is_landmark(p, lmdks): lmdks_seq.append(i + 1) return np.array(lmdks_seq, dtype = int) def find_lmdks_tim(lmdk_data, seq, uid, pct): ''' Find user's landmarks timestamps. Parameters: lmdk_data - The landmarks contacts for all users per landmarks percentage. 0: uid, 1: lmdk_pct, 2: contacts seq - The users' data. 0: uid, 1: lmdk_pct, 2: contacts uid - The user's id that we are interested in. pct - The landmarks percentage. Returns: lmdks - The user's landmarks timestamps for the given landmarks percentage. 0: tim, 1: uid_a, 2: uid_b, 3: rssi ''' # Initialize user's landmarks lmdks = np.empty(0) # All the sequence if pct == 100: # Get all timestamps return seq[:, 1] # Find for given percentage elif pct != 0: # All user's landmark contacts for all landmarks percentages usr_lmdks = lmdk_data[lmdk_data[:, 0] == uid] # User's landmark contacts for given percentage pct_lmdks = ast.literal_eval(usr_lmdks[usr_lmdks[:, 1] == str(pct/100)][0][2]) # Find landmarks timestamps for u in pct_lmdks: lmdks = np.concatenate((lmdks, np.array(seq[seq[:, 2] == float(u)])[:, 0])) return lmdks def find_lmdks_cont(lmdk_data, seq, uid, pct): ''' Find user's landmarks contacts. Parameters: lmdk_data - The landmarks contacts for all users per landmarks percentage. 0: uid, 1: lmdk_pct, 2: contacts seq - The users' data. 0: tim, 1: uid, 2: cont, 3: rssi uid - The user's id that we are interested in. pct - The landmarks percentage. Returns: lmdks - The user's landmarks contacts for the given landmarks percentage. 0: tim, 1: uid, 2: cont, 3: rssi ''' # Initialize user's landmarks lmdks = np.empty(0).reshape(0,4) # All the sequence if pct == 100: # Get all timestamps return seq # Find for given percentage elif pct != 0: # All user's landmark contacts for all landmarks percentages usr_lmdks = lmdk_data[lmdk_data[:, 0] == uid] conts = ast.literal_eval(usr_lmdks[usr_lmdks[:, 1] == str(pct/100)][0][2]) # Actual landmarks for c in conts: lmdks = np.vstack([lmdks, seq[seq[:, 2] == c]]) return lmdks def lmdks_stats(args, usrs_lmdks): ''' Generate landmarks' stats. Parameters: args - The execution arguments. usrs_lmdks - The user' landmarks. 0: lid, 1: uid, 2: lat, 3: lng, 4: tim Returns: Nothing. ''' lmdks_stats = { 'total': 0, 'len': 0, 'count': {}, } # Check every user usrs = np.unique(usrs_lmdks[:, 1]) for usr in usrs: print( '[%d%%] Calculating landmark stats for user %d/%d... ' %(usr*100/len(usrs), usr, np.max(usrs)), end='', flush=True ) # Check each user's landmarks for lmdk in np.unique(usrs_lmdks[usrs_lmdks[:, 1] == usr, 0]): lmdk_traj = usrs_lmdks[usrs_lmdks[:, 0] == lmdk, :] lmdks_stats['total'] += 1 lmdks_stats['len'] += len(lmdk_traj) if lmdks_stats['count'].get(len(lmdk_traj)) == None: lmdks_stats['count'][len(lmdk_traj)] = 1 else: lmdks_stats['count'][len(lmdk_traj)] += 1 print('[OK]') hist_min, hist_max = min(lmdks_stats['count'], key=int, default=0), max(lmdks_stats['count'], key=int, default=0) # Make histogram x = [] for i in range(hist_min, hist_max + 1): if lmdks_stats['count'].get(i) != None: x.extend([i]*lmdks_stats['count'].get(i)) # Show stats print( '\n############ Stats ###########\n' 'Landmarks : %d\n' ' Length\n' ' Total : %d (%.2f%%)\n' ' Minimum : %d\n' ' Maximum : %d\n' '##############################\n' %(lmdks_stats['total'], lmdks_stats['len'], (lmdks_stats['len']/args.time)*100, min(lmdks_stats['count'], key=int, default=0), max(lmdks_stats['count'], key=int, default=0)) ) # # Initialize plot # plot_init() # # Set x axis # plt.xlabel('Landmarks sequence length') # plt.xticks(rotation='vertical') # # The y axis # plt.ylabel('Number of sequences') # plt.yscale('log') # Create histogram # plt.hist(x, bins=(hist_max - hist_min)) # Show plot # plt.show() # Save plot # save_plot(os.path.dirname(args.arc) + '/' + 'results' + '(dis=' + str(args.dist) + ', per=' + str(args.per) + ').pdf') def should_sample(samp_rt): ''' Randomly decide to release with noise. Parameters: samp_rt - The sampling rate (0, 1] Returns: True/False ''' return random.choices(population=[True, False], weights=[samp_rt, 1 - samp_rt], k=1)[0] def is_landmark(p, lmdks): ''' Check is a point is a landmark. Parameters: p - The point to check. lmdks - A 2d array of landmarks. Returns: True/False ''' if len(lmdks) > 0 and any(np.equal(lmdks, p).all(1)): return True return False