code: Added libraries

2021-07-25 16:51:51 +03:00
parent 6ec58b9adc
commit 7dc347cf53
2 changed files with 1446 additions and 0 deletions
--- a/code/lib/lmdk_bgt.py
+++ b/code/lib/lmdk_bgt.py
@ -0,0 +1,479 @@
 #!/usr/bin/env python3
 from geopy.distance import distance
 import math
 import numpy as np
 import random
 import sympy as sp
 import time
 import lmdk_lib
 from matplotlib import pyplot as plt
 def draw_bgts(title, bgts):
  '''
    Plots the allocated budget.
    Parameters:
      title - The title of the plot.
      bgts - The allocated privacy budget.
    Returns:
      Nothing.
  '''
  lmdk_lib.plot_init()
  p = np.arange(1, len(bgts) + 1, 1)
  plt.plot(
    p,
    bgts,
    linewidth=lmdk_lib.line_width,
    markersize=lmdk_lib.marker_size,
    markeredgewidth=0,
    label=r'$\varepsilon$',
    marker='s'
  )
  lmdk_lib.plot_legend()
  # Set plot box
  plt.axis([1, len(bgts), .0, max(bgts) +  max(bgts)/2])
  # Set x axis label
  plt.xlabel('Timestamp')
  # Set y axis label
  plt.ylabel('Privacy budget')
  plt.title(title.replace('_', '-'))
  plt.show()
 def validate_bgts(seq, lmdks, epsilon, bgts):
  '''
    Budget allocation validation.
    Parameters:
      seq - The point sequence.
      lmdks - The landmarks.
      epsilon - The available privacy budget.
      bgts - The privacy budget allocation.
    Returns:
      The index of the privacy budget that caused the failure or -1 if successful.
  '''
  bgts_sum = .0
  # Landmarks
  for i, p in enumerate(seq):
    if np.any(lmdks[:] == p):
      bgts_sum += bgts[i]
  # Regular events
  bgts_max = bgts_sum
  for i, p in enumerate(seq):
    if not np.any(lmdks[:] == p):
      bgts_cur = bgts_sum + bgts[i]
      if bgts_cur > bgts_max:
        bgts_max = bgts_cur
      if bgts_cur > epsilon and not math.isclose(bgts_cur, epsilon, rel_tol=.001):
        print('Budget allocation validation failed: %.2f%% (%.4f/%.4f)' %(100*bgts_max/epsilon, bgts_max, epsilon))
        return i
  print(
    'Landmark budget allocation: %.2f%% (%.4f/%.4f)\n' 
    'Overall budget allocation : %.2f%% (%.4f/%.4f)\n' 
    %(100*bgts_max/epsilon, bgts_max, epsilon,
    100*sum(bgts)/epsilon, sum(bgts), epsilon,))
  return -1
 def uniform(seq, lmdks, epsilon):
  '''
    Uniform budget allocation.
    Parameters:
      seq - The point sequence.
      lmdks - The landmarks.
      epsilon - The available privacy budget.
    Returns:
      bgts - The privacy budget allocation.
  '''
  bgts = np.zeros(len(seq))
  # Allocate enough budget for all landmarks and one regular point
  k = len(lmdks) + 1
  # All points are landmarks
  if k > len(seq):
    k = len(lmdks)
  # Allocate the budget
  for i, _ in enumerate(bgts):
    bgts[i] = epsilon/k
  return bgts
 def geometric(seq, lmdks, epsilon):
  '''
    Geometric budget allocation.
    Parameters:
      seq - The point sequence.
      lmdks - The landmarks.
      epsilon - The available privacy budget.
    Returns:
      bgts - The privacy budget distribution.
  '''
  bgts = np.zeros([len(seq)])
  epsilon_avail = epsilon
  # Landmarks
  for i, p in enumerate(seq):
    if np.any(lmdks[:] == p):
      bgts[i] = epsilon_avail/2
      epsilon_avail = epsilon_avail - epsilon_avail/2
  # Regular events
  for i, p in enumerate(seq):
    if not np.any(lmdks[:] == p):
      bgts[i] = epsilon_avail
  return bgts
 def exponential(seq, lmdks, epsilon):
  '''
    Exponential uniform budget allocation (max to user-level).
    Parameters:
      seq - The point sequence.
      lmdks - The landmarks.
      epsilon - The available privacy budget.
    Returns:
      bgts - The privacy budget allocation.
  '''
  # Fallback to uniform if zero or max landmarks
  if len(lmdks) == 0 or len(lmdks) == len(seq):
    return uniform(seq, lmdks, epsilon)
  # # In case of seq == lmdks
  # l = 0
  # N = epsilon/len(seq)
  # Otherwise
  bgts = np.zeros([len(seq)])
  if len(seq) != len(lmdks):
    # Find worst case regural point
    p_sel = 0
    for p in seq:
      if not np.any(lmdks[:] == p):
        p_sel = p
        break
    # List all landmark timestamps with the worst regular
    points = np.append(lmdks, [p_sel])
    points = np.sort(points)
    # epsilon_t = N*e^-l*t
    l = sp.symbols('l', real=True)
    # Cumulative allocation at landmarks and one extra point, i.e., L union {t}
    T = seq.max()
    # Bounding the privacy budgets at L union {t}
    # epsilon = sum(N*e^-l*t) for t in L union {t}
    t = sp.symbols('t')
    eq = 0
    for t in points:
      eq += (((epsilon/len(seq))/(sp.exp(-1*l*T)))*sp.exp(-1*l*t))
    try:
      l = sp.solve(eq - epsilon, simplify=False, rational=False)[0]
      # l = sp.solveset(eq <= epsilon, l, sp.S.Reals).sup
    except Exception:
      return bgts
    # Allocate to the last point T epsilon/len(seq), i.e., user-level
    N = (epsilon/len(seq))/sp.exp(-1*l*T)
  # Allocate the budget
  for i, t in enumerate(seq):
    bgts[i] = N*sp.exp(-1*l*t)
  return bgts
 def linear_zero(seq, lmdks, epsilon):
  '''
    Linear uniform budget allocation (max to zero).
    Parameters:
      seq - The point sequence.
      lmdks - The landmarks.
      epsilon - The available privacy budget.
    Returns:
      bgts - The privacy budget allocation.
  '''
  bgts = np.zeros([len(seq)])
  # Find worst case regural point
  p_sel = 0
  for p in seq:
    if not np.any(lmdks[:] == p):
      p_sel = p
      break
  # Sum all landmark timestamps with the worst regular
  sum_cur = p_sel
  for p in lmdks:
    sum_cur += p
  # epsilon_t = a*t + b
  b = sp.symbols('b')
  # Cumulative allocation at landmarks and one extra point
  T = seq[len(seq) - 1]
  b = sp.solve(b - ((b/T)*sum_cur + epsilon)/(len(lmdks) + 1))[0]
  # Allocate to the last point 0
  a = -b/T
  # Allocate the budget
  for i, t in enumerate(seq):
    bgts[i] = a*t + b
  return bgts
 def linear(seq, lmdks, epsilon):
  '''
    Linear uniform budget allocation (max to user-level).
    Parameters:
      seq - The point sequence.
      lmdks - The landmarks.
      epsilon - The available privacy budget.
    Returns:
      bgts - The privacy budget allocation.
  '''
  # Fallback to uniform if zero or max landmarks
  if len(lmdks) == 0 or len(lmdks) == len(seq):
    return uniform(seq, lmdks, epsilon)
  # Find worst case regural point
  p_sel = 0
  for p in seq:
    if not np.any(lmdks[:] == p):
      p_sel = p
      break
  # Sum all landmark timestamps with the worst regular
  sum_cur = p_sel + np.sum(lmdks)
  # epsilon_t = a*t + b
  b = sp.symbols('b', real=True)
  # Cumulative allocation at landmarks and one extra point, i.e., L union {t}
  T = seq.max()
  # Number of points to take into account
  k = len(lmdks) + 1
  # if len(lmdks) == len(seq):
  #   k = len(lmdks)
  # Bounding the privacy budgets at L union {t}
  # epsilon = a*sum(L union {t}) + |L union {t}|*b
  b = sp.solve(b + (((epsilon/len(seq) - b)/T)*sum_cur - epsilon)/k, simplify=False, rational=False)[0]
  # Allocate to the last point T epsilon/len(seq), i.e., user-level
  a = (epsilon/len(seq) - b)/T
  # Allocate the budget
  bgts = np.zeros([len(seq)])
  for i, t in enumerate(seq):
    bgts[i] = a*t + b
  return bgts
 def plot_line(x_i, arr, label, marker, line):
  plt.plot(x_i,
          arr,
          label=label,
          color='black',
          marker=marker,
          linestyle=line,
          # linewidth=1.0,
          markerfacecolor='none')
 def stepped(seq, lmdks, epsilon):
  '''
    Stepped budget allocation.
    Parameters:
      seq - The point sequence.
      lmdks - The landmarks.
      epsilon - The available privacy budget.
    Returns:
      bgts - The privacy budget allocation.
  '''
  bgts = np.zeros([len(seq)])
  epsilon_avail = epsilon/2
  for i, p in enumerate(seq):
    # Reduce the available budget when a landmark is reached
    if np.any(lmdks[:] == p):
      epsilon_avail = epsilon_avail/2
    bgts[i] = epsilon_avail
  return bgts
 def adaptive(seq, lmdks, epsilon):
  '''
    Adaptive budget allocation.
    Parameters:
      seq - The point sequence.
      lmdks - The landmarks.
      epsilon - The available privacy budget.
    Returns:
      rls_data - The perturbed data.
      bgts - The privacy budget allocation.
      skipped - The number of skipped releases.
  '''
  # Uniform budget allocation
  bgts = uniform(seq, lmdks, epsilon)
  # Released
  rls_data = [None]*len(seq)
  # The sampling rate
  samp_rt = 1
  # Track landmarks
  lmdk_cur = 0
  # Track skipped releases
  skipped = 0
  for i, p in enumerate(seq):
    # Check if current point is a landmark
    if lmdk_lib.is_landmark(p, lmdks):
      lmdk_cur += 1
    # Get coordinates
    loc = (p[1], p[2])
    if lmdk_lib.should_sample(samp_rt) or i == 0:
      # Add noise to original data
      new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
      rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
      # Adjust sampling rate
      if i > 0:
        if distance((rls_data[i - 1][1], rls_data[i - 1][2]), new_loc).km*1000 < 1/bgts[i]:
          # Decrease
          # samp_rt -= samp_rt*.9
          # samp_rt -= samp_rt*.75
          samp_rt -= samp_rt*.5
          # samp_rt -= samp_rt*.25
          # samp_rt -= samp_rt*.1
        else:
          # Increase
          # samp_rt += (1 - samp_rt)*.9
          # samp_rt += (1 - samp_rt)*.75
          samp_rt += (1 - samp_rt)*.5
          # samp_rt += (1 - samp_rt)*.25
          # samp_rt += (1 - samp_rt)*.1
    else:
      skipped += 1
      # Skip current release and approximate with previous
      rls_data[i] = rls_data[i - 1]
      if lmdk_lib.is_landmark(p, lmdks):
        # Allocate the current budget to the following releases uniformly
        for j in range(i + 1, len(seq)):
          bgts[j] += bgts[i]/(len(lmdks) - lmdk_cur + 1)
      # No budget was spent
      bgts[i] = 0
  return rls_data, bgts, skipped
 def skip(seq, lmdks, epsilon):
  '''
    Skip landmarks.
    Parameters:
      seq - The point sequence.
      lmdks - The landmarks.
      epsilon - The available privacy budget.
    Returns:
      rls_data - The perturbed data.
      bgts - The privacy budget allocation.
  '''
  # Event-level budget allocation
  bgts = np.array(len(seq)*[epsilon])
  # Released
  rls_data = [None]*len(seq)
  for i, p in enumerate(seq):
    # Get coordinates
    loc = (p[1], p[2])
    # Add noise to original data
    new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
    # Check if current point is a landmark
    if lmdk_lib.is_landmark(p, lmdks):
      if i > 0:
        # Approximate with previous location
        new_loc = (rls_data[i - 1][1], rls_data[i - 1][2])
      bgts[i] = 0
    rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
  return rls_data, bgts
 def sample(seq, lmdks, epsilon):
  '''
    Publish randomly.
    Parameters:
      seq - The point sequence.
      lmdks - The landmarks.
      epsilon - The available privacy budget.
    Returns:
      rls_data - The perturbed data.
      bgts - The privacy budget allocation.
      skipped - The number of skipped releases.
  '''
  # Uniform budget allocation
  bgts = uniform(seq, lmdks, epsilon)
  # Released
  rls_data = [None]*len(seq)
  # The sampling rate
  # (publish with 50% chance)
  samp_rt = .5
  # Track landmarks
  lmdk_cur = 0
  # Track skipped releases
  skipped = 0
  for i, p in enumerate(seq):
    # Check if current point is a landmark
    if lmdk_lib.is_landmark(p, lmdks):
      lmdk_cur += 1
    # Get coordinates
    loc = (p[1], p[2])
    if i == 0 or lmdk_lib.should_sample(samp_rt):
      # Add noise to original data
      new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
      rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
    else:
      skipped += 1
      # Skip current release and approximate with previous
      rls_data[i] = rls_data[i - 1]
      if lmdk_lib.is_landmark(p, lmdks):
        # Allocate the current budget to the following releases uniformly
        for j in range(i + 1, len(seq)):
          bgts[j] += bgts[i]/(len(lmdks) - lmdk_cur + 1)
      # No budget was spent
      bgts[i] = 0
  return rls_data, bgts, skipped
 def uniform_r(seq, lmdks, epsilon):
  # Released
  rls_data = [None]*len(seq)
  # Budgets
  bgts = uniform(seq, lmdks, epsilon)
  for i, p in enumerate(seq):
    # Get coordinates
    loc = (p[1], p[2])
    # Add noise to original data
    new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
    rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
  return rls_data, bgts
 def utility_analysis(seq, lmdks, o, epsilon):
  '''
    Analyze the utility.
    Parameters:
      seq     - The point sequence.
      lmdks   - The landmarks.
      o       - The perturbed data.
      epsilon - The available privacy budget.
    Returns:
      Nothing.
  '''
  # Estimate Mean Absolute Error
  mae = 0
  # Compare with uniform
  mae_u = 0
  bgts_u = uniform(seq, lmdks, epsilon)
  for i, p in enumerate(seq):
    mae += distance((p[1], p[2]), (o[i][1], o[i][2])).km*1000/len(seq)
    new_loc = lmdk_lib.add_polar_noise((p[1], p[2]), bgts_u[i])
    mae_u += distance((p[1], p[2]), (new_loc[0], new_loc[1])).km*1000/len(seq)
  print(
    '\n########## Analysis ##########\n'
    'MAE uniform : %.2fm\n'
    'MAE current : %.2fm\n'
    'Difference  : %.2f%%\n'
    '##############################\n'
    %(mae_u, mae, 100*(mae - mae_u)/mae_u)
  )
 def mae(seq, o):
  mae = 0
  for i, p in enumerate(seq):
    mae += distance((p[1], p[2]), (o[i][1], o[i][2])).km*1000/len(seq)
  return mae
--- a/code/lib/lmdk_lib.py
+++ b/code/lib/lmdk_lib.py
@ -0,0 +1,967 @@
 #!/usr/bin/env python3
 from datetime import datetime
 from geopy.distance import distance
 import heapq
 import itertools
 from scipy.special import lambertw
 import math
 import numpy as np
 import os
 import random
 import scipy.stats as stats
 import time
 from matplotlib import pyplot as plt
 import zipfile
 # Plot globals
 dpi = 300
 font_size = 24.0
 line_width = 2.0
 marker_size = 14.0
 tick_length = 8.0
 def add_polar_noise(loc, epsilon):
  '''
    Add noise from planar Laplace.
    [https://github.com/chatziko/location-guard/blob/master/src/js/common/laplace.js]
    Parameters:
      loc     - The original location 
    loc     - The original location 
      loc     - The original location 
      epsilon - The privacy budget
    Returns:
      The perturbed location (lat, lng)
  '''
  # The bearing in radians.
  # Random number in [0, 2*pi),
  theta = np.random.uniform(low = 0, high = np.pi*2)
  # The distance in meters.
  # Use the inverse cumulative polar laplacian distribution function.
  r = -np.real((lambertw((np.random.uniform(low = 0, high = 1) - 1)/np.e, k = -1) + 1)/epsilon)
  new_loc = None
  while not is_valid(new_loc):
    new_loc = distance(kilometers = r/1000).destination(point = loc, bearing = np.degrees(theta))
  return new_loc
 def draw_line(line, x, label, marker):
  axis_x = list(range(len(x)))
  # plt.xticks(axis_x, x)
  plt.plot(axis_x, x, label=label, marker=marker)
 def eval_seq(dists):
  '''
    Calculate the standard deviation of a list of distances.
    Parameters:
      dists - A list of distances.
    Returns:
      The standard deviation of the distances.
  '''
  return np.std(dists)
 def get_abs_dists(seq, comb, lmdks):
  '''
    Get the distances of the points in a combination from the start and end of the sequence.
    Parameters:
      seq - The point sequence.
      comb - A possible point combination for a specified size.
      lmdks - The landmarks.
    Returns:
      dists - The distances of the points in a combination.
  '''
  cur = np.append(comb, lmdks)
  cur.sort()
  start = seq[0]
  end = seq[len(seq) - 1]
  dists = np.zeros([len(cur)*2])
  # Check if there are any points
  if cur.size:
    for i in range(0, len(cur)):
      # From the start
      dists[i*2] = abs(cur[i] - start)
      # From the end
      dists[i*2 + 1] = abs(cur[i] - end)
  return dists
 def get_combs(seq, size):
  '''
    Get all the possible landmark point combinations for a specified size.
    Parameters:
      seq - The point sequence.
      size - The desired number of landmarks.
    Returns:
      All the possible landmark combinations for a specified size.
  '''
  return itertools.combinations(seq, size)
 def get_combs_h(h, hist, max):
  combs = []
  for comb in itertools.product(range(h + 1), repeat=len(hist)):
    check = True
    for i, c in enumerate(comb):
      if sum(comb) > max or c < hist[i] or c > h:
        check = False
        break
    if check:
      combs.append(comb)
  return combs
 def get_emd(h1, h2):
  '''
    The earth mover's distance (EMD), or Wasserstein distance, of two histograms.
    [https://stats.stackexchange.com/a/151362]
    Pele et al., The quadratic-chi histogram distance family
    Parameters:
      h1 - A histogram.
      h2 - Another histogram.
    Return:
      The EMD of the histograms.
  '''
  return stats.wasserstein_distance(h1,h2)
  # return stats.wasserstein_distance(get_norm_hist(h1), get_norm_hist(h2))
 def get_hist(seq, lmdks):
  '''
    Create a histogram from a set of landmarks.
    Parameters:
      seq - The point sequence.
      lmdks - The landmarks.
    Return:
      hist - The histogram of landmarks.
      h - The bin size.
  '''
  # # Create a landmark sequence relative to the entire sequence
  # lmdks_rel = np.array(lmdks)
  # # Add the start of the sequence if not in landmarks
  # if not np.any(lmdks_rel[:] == start):
  #   lmdks_rel = np.insert(lmdks_rel, 0, start)
  # # Add the end of the sequence if not in landmarks
  # if not np.any(lmdks_rel[:] == end):
  #   lmdks_rel = np.append(lmdks_rel, end)
  # Dealing with zeros.
  if len(seq) == 0 or len(lmdks) == 0:
    return np.zeros(math.ceil(max(seq))), 1
  # Interquartile range (IQR) is a measure of statistical dispersion, being equal to the difference between 75th and 25th percentiles, or between upper and lower quartiles.
  # https://en.wikipedia.org/wiki/Interquartile_range
  iqr = stats.iqr(lmdks)
  # Use the Freedman–Diaconis rule to select the width of the bins to be used in the histogram.
  # Robust (resilient to outliers) estimator that takes into account data variability and data size.
  # https://en.wikipedia.org/wiki/Freedman%E2%80%93Diaconis_rule
  h = 2*iqr*len(lmdks)**(-1/3)
  if h < 1:
    h = 1
  # On the number of bins and width:
  # https://en.wikipedia.org/wiki/Histogram#Number_of_bins_and_width
  # Normalize the interval
  h = math.ceil(max(seq)/math.ceil(max(seq)/math.ceil(h)))
  # Create an empty histogram with intervals of size h
  hist = np.zeros(math.ceil(max(seq)/h))
  for lmdk in lmdks:
    hist[int(lmdk/h) - 1] += 1
  return hist, h
 def get_lmdks(seq, size, dist):
  '''
    Get a landmark set for a distribution type.
    Parameters:
      seq  - The point sequence.
      size - The landmarks' size.
      dist - The distribution type code.
            -1 - Left-skewed
              0 - Symmetric
            +1 - Right-skewed
            +2 - Bimodal
            +3 - Uniform
    Return:
      hist - The histogram of landmarks.
      h - The bin size.
  '''
  scale = len(seq)/10
  if dist == -1 or dist == 0 or dist == 1:
    return MixtureModel([truncnorm(min(seq), max(seq), get_loc(seq, dist), scale)]).sample(min(seq), max(seq), [], size)
  elif dist == 2:
    return MixtureModel([
      truncnorm(min(seq), max(seq), get_loc(seq, -1), scale),
      truncnorm(min(seq), max(seq), get_loc(seq, +1), scale)
    ]).sample(min(seq), max(seq), [], size)
  else:
    return np.sort(np.random.choice(np.arange(min(seq), max(seq) + 1, 1), size=size, replace=False))
 def get_lmdks_rand(seq, size, skew):
  '''
    Get a landmark set with a given skewness.
    Parameters:
      seq - The point sequence.
      size - The landmarks' size.
      skew - The desired skewness.
    Return:
      hist - The histogram of landmarks.
      h - The bin size.
  '''
  # Get all possible landmark combinations
  lmdk_combs = get_combs(seq, size)
  # Find a set of landmarks for the given requirements
  lmdks = ()
  for comb in lmdk_combs:
    skew_cur = get_skew(comb)
    # Check if skewness is close to the requirement or have the same sign
    if math.isclose(skew_cur, skew, rel_tol=.1) or ((skew_cur < 0) == (skew < 0)):
      lmdks = comb
      break
  return lmdks
 def get_loc(seq, skew):
  '''
    Get the location of the sequence with a given skewness.
    Parameters:
      seq  - The point sequence.
      skew - The desired skewness.
    Return:
      The location.
  '''
  loc_min = min(seq)
  loc_max = max(seq)
  if skew < 0:
    loc_min = (max(seq) - min(seq))/2
  elif skew > 0:
    loc_max = (max(seq) - min(seq))/2
  return int(loc_min + (loc_max - loc_min)/2)
 def get_mean(seq, hist, h):
  '''
    Get the mean of the histogram.
    Parameters:
      seq - The point sequence.
      hist - The histogram of landmarks.
      h - The bin size.
    Return:
      The mean of the histogram.
  '''
  sum = 0
  for idx, count in enumerate(hist):
    # Find bin limits
    start = min(seq) + h*idx
    end = min(seq) + h*(idx + 1)
    if(end > max(seq)):
      end = max(seq)
    sum += (start + (end - start)/2)*count
  return sum/np.sum(hist)
 def get_norm(h1, h2):
  '''
    The Euclidean distance of two histograms.
    [https://stackoverflow.com/a/1401828/13123075]
    Parameters:
      h1 - A histogram.
      h2 - Another histogram.
    Return:
      The Euclidean distance of the histograms.
  '''
  return np.linalg.norm(h1 - h2)
 def get_norm_hist(hist):
  '''
    Normalize the histogram.
    Parameters:
      hist - The original histogram.
    Return:
      The normalized histogram.
  '''
  # In-place type conversion
  norm_hist = hist.astype(np.float32)
  n = np.sum(norm_hist)
  for i, a in enumerate(norm_hist):
    if a:
      norm_hist[i] = a/n
  return norm_hist
 def get_opts(seq, lmdks):
  '''
    Finds all the possible valid options.
    Parameters:
      seq - The point sequence.
      lmdks - The landmarks.
    Returns:
      A list with all the possible options.
    Requirements:
      n = Regular points
      r = The size of a combination
      Time  - O(C(n, r) + 2^C(n, r)), because O(n*C(n, r)) = O(n*n^min(r, n - r))
      Space - O(r*C(n, r))
  '''
  reg = get_reg(seq, lmdks)
  # Find all possible combinations for every k
  combs = []
  for k in range(len(lmdks) + 1, seq[len(seq) - 1] + 1):
    combs.append(get_combs(reg, k - len(lmdks)))
  # Find all possible options for all combinations
  opts = itertools.product(*combs)
  # Keep only valid options, i.e., larger sets must contain every element of every smaller set
  rslts = []
  for opt in opts:
    for i, _ in enumerate(opt):
      if i and not set(opt[i - 1]).issubset(opt[i]):
        break
    if i == len(opt) - 1:
      rslts.append(opt)
  return rslts
 def get_reg(seq, lmdks):
  '''
    Get the regular points, i.e., non-landmarks, in a sequence.
    Parameters:
      seq - The point sequence.
      lmdks - The landmarks.
    Returns:
      The non-landmark points in a sequence.
  '''
  return np.array([i for i in seq if i not in lmdks])
 def get_rel_dists(seq, comb, lmdks):
  '''
    Get the distances of the points in a combination from the nearest point of reference.
    That is, the previous/next point or the start/end if closer.
    Parameters:
      seq - The point sequence.
      comb - A possible point combination for a specified size.
      lmdks - The landmarks.
    Returns:
      dists - The distances of the points in a combination.
  '''
  # TODO: Review distances. Maybe take into account both ways?
  cur = np.append(comb, lmdks)
  cur.sort()
  start = seq[0]
  end = seq[len(seq) - 1]
  dists = np.zeros([len(cur) + 1])
  # Check if there are any points
  if cur.size:
    # First
    dists[0] = abs(cur[0] - start)
    # In-between
    for i in range(0, len(cur) - 1):
      dists[i + 1] = abs(cur[i] - cur[i + 1])
    # Last
    dists[len(cur)] = abs(cur[len(cur) - 1] - end)
  return dists
 def get_seq(start, end):
  '''
    Get a sequence of points.
    Parameters:
      start - The starting point of the sequence.
      end - The ending point of the sequence.
    Returns:
      A point sequence.
  '''
  return np.arange(start, end + 1)
 def get_shannon_ent(a):
  '''
    Get the Shannon entropy of an array.
    It can be used to calculate the uniformity of a histogram.
    Uniform probability yields maximum uncertainty and therefore maximum entropy.
    Entropy, then, can only decrease from the value associated with uniform probability.
    Parameters:
      a - An array of integers
    Return:
      The Shannon entropy.
  '''
  # Keep only the non-zero elements of array a because log2(0) is -inf.
  # The entropy of an event with zero probability is 0.
  a = a[a != 0]
  # Histograms are discrete probability distributions.
  # We convert the counts to probabilities.
  p_a = a/np.sum(a)
  # Base 2 gives the unit of bits (or 'shannons').
  return -np.sum(p_a*np.log2(p_a))
 def get_shannon_ent_norm(a):
  '''
    Get the normalized Shannon entropy of an array.
    It ranges from 0 (high entropy - close to uniform) to 1 (low entropy). 
  It ranges from 0 (high entropy - close to uniform) to 1 (low entropy). 
    It ranges from 0 (high entropy - close to uniform) to 1 (low entropy). 
    Parameters:
      a - An array of integers
    Return:
      The normalized Shannon entropy.
  '''
  return 1 - get_shannon_ent(a)/np.log2(len(a))
 def get_skew(lmdks):
  '''
    Fisher-Pearson coefficient of skewness.
    negative - left (more data items right)
    zero     - symmetric data
    positive - right (more data items left) 
  positive - right (more data items left) 
    positive - right (more data items left) 
    Parameters:
      seq - The point sequence.
      hist - The histogram of landmarks.
      h - The bin size.
    Return:
      Fisher-Pearson coefficient of skewness.
  '''
 # def get_skew(seq, hist, h):
 #   return get_third(seq, hist, h)/get_std(seq, hist, h)**3
  return stats.skew(lmdks)
 def get_std(seq, hist, h):
  '''
    Get the standard deviation.
    Parameters:
      seq - The point sequence.
      hist - The histogram of landmarks.
      h - The bin size.
    Return:
      The standard deviation.
  '''
  hist_mean = get_mean(seq, hist, h)
  sum = 0
  for idx, count in enumerate(hist):
    # Find bin limits
    start = min(seq) + h*idx
    end = min(seq) + h*(idx + 1)
    if(end > max(seq)):
      end = max(seq)
    sum += (((start + (end - start)/2) - hist_mean)**2)*count
  return sum/np.sum(hist)
 def get_third(seq, hist, h):
  '''
    Get the third central moment.
    Parameters:
      seq - The point sequence.
      hist - The histogram of landmarks.
      h - The bin size.
    Return:
      The third central moment.
  '''
  hist_mean = get_mean(seq, hist, h)
  sum = 0
  for idx, count in enumerate(hist):
    # Find bin limits
    start = min(seq) + h*idx
    end = min(seq) + h*(idx + 1)
    if(end > max(seq)):
      end = max(seq)
    sum += (((start + (end - start)/2) - hist_mean)**3)*count
  return sum/np.sum(hist)
 def dist_type_to_str(d):
  '''
    Convert the distribution type code to string.
    Parameters:
      t - The distribution type code.
          -1 - Left-skewed
          0 - Symmetric
          +1 - Right-skewed
          +2 - Bimodal
          +3 - Uniform
    Return:
      The distribution type.
  '''
  if d == -1:
    return 'Left-skewed'
  elif d == 0:
    return 'Symmetric'
  elif d == 1:
    return 'Right-skewed'
  elif d == 2:
    return 'Bimodal'
  elif d == 3:
    return 'Uniform'
  else:
    return 'Undefined'
 class MixtureModel(stats.rv_continuous):
  '''
    A mixture model of continuous probability distributions
    [https://stackoverflow.com/a/47763145/13123075]
  '''
  def __init__(self, submodels, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.submodels = submodels
  def _pdf(self, x):
    pdf = self.submodels[0].pdf(x)
    for submodel in self.submodels[1:]:
      pdf += submodel.pdf(x)
    pdf /= len(self.submodels)
    return pdf
  def rvs(self, size):
    submodel_choices = np.random.randint(len(self.submodels), size=size)
    submodel_samples = [submodel.rvs(size=size) for submodel in self.submodels]
    rvs = np.choose(submodel_choices, submodel_samples)
    return rvs
  def sample(self, lower, upper, sample, size):
    '''
      Sample from the mixture model without replacement.
      [https://stackoverflow.com/a/20548895/13123075]
      Parameters:
        self   - The mixture model.
        lower  - The lower bound of the range.
        upper  - The upper bound of the range.
        sample - Items to be excluded.
        size   - The sample size.
      Returns:
        A sample.
    '''
    w = self.pdf(range(int(lower), int(upper) + 1))
    idx = []
    for i in range(int(lower), int(upper) + 1):
      if i not in sample:
        idx.append(i)
    elt = [(math.log(random.random()) / w[i - 1], i) for i in idx]
    return np.sort([x[1] for x in heapq.nlargest(size, elt)])
 def plot_init():
  '''
    Initialize the plot.
  '''
  # Reset
  plt.close('all')
  # Style
  plt.style.use('classic')
  # DPI
  plt.figure(dpi=dpi)
  # Font
  plt.rc('font', family='sans-serif')
  plt.rc('font', **{'sans-serif':['Liberation Sans']})
  plt.rc('font', size=font_size)
  # Grid
  plt.setp(plt.figure().add_subplot(111).spines.values(), linewidth=line_width)
  plt.grid(True, axis='y', linewidth=line_width)
  # Ticks
  plt.gca().tick_params(which='both', width=line_width)
  plt.gca().tick_params(which='major', length=tick_length)
  plt.gca().tick_params(which='minor', length=tick_length/2)
  # Colors
  plt.subplot(111).set_prop_cycle('color', ['#212121', '#616161', '#9e9e9e', '#bdbdbd', '#e0e0e0', 'f5f5f5'])
  # Layout
  plt.tight_layout()
 def plot_legend():
  '''
    Initialize the plot legend.
  '''
  plt.legend(
    loc='best',
    fontsize=font_size,
    numpoints=1,
    borderpad=.2,      # default: 0.4
    labelspacing=.25,  # default: 0.5
    handlelength=2.0,  # default: 2.0
    handletextpad=.4,  # default: 0.8
    borderaxespad=.5,  # default: 0.5
  )
 def save_plot(path):
  '''
    Save the plot to a file.
    Parameters:
      path - The desired location.
    Returns:
      Nothing.
  '''
  # Save plot
  os.makedirs(os.path.dirname(path), exist_ok=True)
  plt.savefig(path, bbox_inches='tight')
  # Clean up
  plt.clf()
  plt.close()
 def print_lmdks(seq, lmdks):
  '''
    Print the landmarks.
    Parameters:
      seq - The point sequence.
      lmdks - The landmarks.
    Returns:
      Nothing.
  '''
  print('\n######### Landmarks ##########')
  print(lmdks)
  print('[', end='', flush=True)
  for p in seq:
    if np.any(lmdks[:] == p):
      print('\'', end='', flush=True)
    else:
      print('.', end='', flush=True)
  print(']', end='', flush=True)
  print('\n##############################\n')
 def plot_dist(seq, dist):
  '''
    Plot the probability distribution.
    Parameters:
      seq  - The point sequence.
      size - The landmarks' size.
      dist - The distribution type code.
            -1 - Left-skewed
              0 - Symmetric
            +1 - Right-skewed
            +2 - Bimodal
            +3 - Uniform
    Returns:
      Nothing.
  '''
  scale = len(seq)/10
  p = stats.uniform.pdf(seq)
  if dist == -1 or dist == 0 or dist == 1:
    p = MixtureModel([truncnorm(min(seq), max(seq), get_loc(seq, dist), scale)]).pdf(seq)
  elif dist == 2:
    p = MixtureModel([
      truncnorm(min(seq), max(seq), get_loc(seq, -1), scale),
      truncnorm(min(seq), max(seq), get_loc(seq, +1), scale)
    ]).pdf(seq)
  plt.plot(seq, p)
  plt.show()
 def simplify_data(seq, lmdks):
  '''
    Get synthetic from real data.
    Parameters:
      seq   - The trajectory.
      lmdks - The landmarks.
    Returns:
      The simplified sequence and landmarks.
  '''
  seq_s = get_seq(1, len(seq))
  lmdks_s = []
  for i, p in enumerate(seq):
    if is_landmark(p, lmdks):
      lmdks_s.append(i + 1)
  return seq_s, np.array(lmdks_s)
 def truncnorm(lower, upper, loc, scale):
  '''
    A truncated normal continuous random variable.
    Parameters:
      lower - The lower bound of the range.
      upper - The upper bound of the range.
      loc - The location of the distribution.
      scale - The spread of the distribution.
    Returns:
      A truncated normal continuous random variable.
  '''
  return stats.truncnorm((lower - loc)/scale, (upper - loc)/scale, loc, scale)
 def is_valid(coord):
  '''
    Check if coordinates are valid.
    Parameters:
      coord - The coordinates to be checked.
    Returns:
      True  - Valid coordinates.
      False - Invalid coordinates.
  '''
  if coord is None or (float(coord[0]) > 90 or float(coord[0]) < -90  # lat
  or float(coord[1]) > 180 or float(coord[1]) < -180):  #lng
    return False
  return True
 def save_data(args, data, name):
  '''
    Save to file and compress.
    Parameters:
      args - The execution arguments.
      data - The data.
      name - The name.
    Returns:
      Nothing.
  '''
  try:
    path = os.path.dirname(os.path.abspath(args.res)) + '/' + name
    if 'lmdks' in name:
      path += '(dis=' + str(args.dist) + ', per=' + str(args.per) + ')'
    print('Saving to %s... ' %(path), end='', flush=True)
    with open(path, 'wb') as file:
      _, idx = np.unique(data, axis=0, return_index=True)
      np.save(file, data[np.sort(idx)], allow_pickle=False)
    print('[OK]')
    # Compress file
    print('Compressing in %s... ' %(args.res), end='', flush=True)
    with zipfile.ZipFile(args.res, 'a') as zip_file:
      zip_file.write(path, os.path.basename(path))
    print('[OK]')
    # Delete file
    print('Deleting %s... ' %(path), end='', flush=True)
    os.remove(path)
    print('[OK]')
  except Exception as e:
    print('[Error: %s]' %(e))
 def load_data(args, name):
  '''
    Load data from file.
    Parameters:
      args - The execution arguments.
      data - The data.
      name - The name.
    Returns:
      data - The data set.
             0: uid, 1: lat, 2: lng, 3: tim
  '''
  data = np.array([])
  try:
    path = os.path.dirname(os.path.abspath(args.res)) + '/' + name
    if 'lmdks' in name:
      path += '(dis=' + str(args.dist) + ', per=' + str(args.per) + ')'
    print('Loading %s... ' %(path), end='', flush=True)
    with zipfile.ZipFile(args.res, 'r') as zip_file:
      data = np.load(zip_file.open(os.path.basename(path)))
      print('[OK]')
  except Exception as e:
    print('[Error: %s]' %(e))
  return data
 def find_lmdks(usrs_data, args):
  '''
    Find users' landmarks.
    Parameters:
      args      - The execution arguments.
      usrs_data - The users' data.
                  0: uid, 1: lat, 2: lng, 3: tim
    Returns:
      usrs_lmdks - The users' landmarks.
                  0: lid, 1: uid, 2: lat, 3: lng, 4: tim
  '''
  usrs_lmdks = np.empty((0,5), np.float32)
  traj_cur = 0
  lmdk_id = 0
  usrs = np.unique(usrs_data[:,0])
  for usr_i, usr in enumerate(usrs):
    # Initialize user's landmarks list
    lmdks = []
    traj = usrs_data[usrs_data[:,0]==usr, :]
    traj_cur += len(traj)
    print(
      '[%d%%] Points: %d/%d | Users: %d/%d... '
      %((traj_cur/usrs_data.shape[0])*100, traj_cur, usrs_data.shape[0], usr, usrs[len(usrs) - 1]),
      end='', flush=True
    )
    # Check the actual points
    i = 0
    while i < len(traj) - 1:
      lmdk_cur = []
      if is_valid((traj[i][1], traj[i][2])):
        for j in range(i + 1, len(traj)):
          if is_valid((traj[j][1], traj[j][2])):
            # Add the beginning only the first time
            if j == i + 1:
              lmdk_cur.append([lmdk_id, usr, traj[i][1], traj[i][2], traj[i][3]])
            # Add the new point
            lmdk_cur.append([lmdk_id, usr, traj[j][1], traj[j][2], traj[j][3]])
            # Distance in meters
            dist = distance((traj[i][1], traj[i][2]), (traj[j][1], traj[j][2])).km*1000
            # Distance exceeded or reached end of iteration
            if dist > args.dist or j == len(traj) - 1:
              # Period in minutes
              per = abs(datetime.fromtimestamp(int(traj[i][3])) - datetime.fromtimestamp(int(traj[j][3]))).total_seconds()/60
              # Check if enough time passed
              if per > args.per:
                # usrs_id starts from 1
                lmdk_id += 1
                # Assign id to current landmark
                for l in lmdk_cur:
                  l[0] = lmdk_id
                # Append current landmark
                lmdks += lmdk_cur
              # Continue checking from the current point
              i = j
              break
      # No landmark was found, continue from next point
      if i == 0 or not is_valid((traj[i][1], traj[i][2])) or 'j' not in vars() or i != j:
        i += 1
    print('[OK]')
    if lmdks:
      usrs_lmdks = np.append(usrs_lmdks, np.asarray(lmdks, dtype=np.float32), axis=0)
  return usrs_lmdks
 def lmdks_stats(args, usrs_lmdks):
  '''
    Generate landmarks' stats.
    Parameters:
      args       - The execution arguments.
      usrs_lmdks - The user' landmarks.
                  0: lid, 1: uid, 2: lat, 3: lng, 4: tim
    Returns:
      Nothing.
  '''
  lmdks_stats = {
    'total': 0,
    'len': 0,
    'count': {},
  }
  # Check every user
  usrs = np.unique(usrs_lmdks[:, 1])
  for usr in usrs:
    print(
      '[%d%%] Calculating landmark stats for user %d/%d... ' 
      %(usr*100/len(usrs), usr, np.max(usrs)),
      end='', flush=True
    )
    # Check each user's landmarks
    for lmdk in np.unique(usrs_lmdks[usrs_lmdks[:, 1] == usr, 0]):
      lmdk_traj = usrs_lmdks[usrs_lmdks[:, 0] == lmdk, :]
      lmdks_stats['total'] += 1
      lmdks_stats['len'] += len(lmdk_traj)
      if lmdks_stats['count'].get(len(lmdk_traj)) == None:
        lmdks_stats['count'][len(lmdk_traj)] = 1
      else:
        lmdks_stats['count'][len(lmdk_traj)] += 1
    print('[OK]')
  hist_min, hist_max = min(lmdks_stats['count'], key=int, default=0), max(lmdks_stats['count'], key=int,  default=0)
  # Make histogram
  x = []
  for i in range(hist_min, hist_max + 1):
    if lmdks_stats['count'].get(i) != None:
      x.extend([i]*lmdks_stats['count'].get(i))
  # Show stats
  print(
    '\n############ Stats ###########\n'
    'Landmarks   : %d\n'
    '  Length\n'
    '    Total   : %d (%.2f%%)\n'
    '    Minimum : %d\n'
    '    Maximum : %d\n'
    '##############################\n'
    %(lmdks_stats['total'], lmdks_stats['len'], (lmdks_stats['len']/args.time)*100, min(lmdks_stats['count'], key=int, default=0), max(lmdks_stats['count'], key=int, default=0))
  )
  # # Initialize plot
  # plot_init()
  # # Set x axis
  # plt.xlabel('Landmarks sequence length')  
  # plt.xticks(rotation='vertical')
  # # The y axis
  # plt.ylabel('Number of sequences')
  # plt.yscale('log')
  # Create histogram
  # plt.hist(x, bins=(hist_max - hist_min))
  # Show plot
  # plt.show()
  # Save plot
  # save_plot(os.path.dirname(args.arc) + '/' + 'results' + '(dis=' + str(args.dist) + ', per=' + str(args.per) + ').pdf')
 def should_sample(samp_rt):
  '''
    Randomly decide to release with noise.
    Parameters:
      samp_rt - The sampling rate (0, 1]
    Returns:
      True/False
  '''
  return random.choices(population=[True, False], weights=[samp_rt, 1 - samp_rt], k=1)[0]
 def is_landmark(p, lmdks):
  '''
    Check is a point is a landmark.
    Parameters:
      p     - The point to check.
      lmdks - A 2d array of landmarks.
    Returns:
      True/False
  '''
  if len(lmdks) and any(np.equal(lmdks[:, 1:5], p).all(1)):
    return True
  return False