the-last-thing/code/lib/lmdk_lib.py

#!/usr/bin/env python3

import ast
from datetime import datetime
from geopy.distance import distance
import heapq
import itertools
from scipy.special import lambertw
import math
import numpy as np
import os
import random
import scipy.stats as stats
import time
from matplotlib import pyplot as plt
import zipfile


# Plot globals
dpi = 300
font_size = 24.0
line_width = 2.0
marker_size = 14.0
tick_length = 8.0


def add_polar_noise(loc, epsilon):
  '''
    Add noise from planar Laplace.
    [https://github.com/chatziko/location-guard/blob/master/src/js/common/laplace.js]

    Parameters:
      loc     - The original location.
      epsilon - The privacy budget.
    Returns:
      The perturbed location (lat, lng).
  '''
  # The bearing in radians.
  # Random number in [0, 2*pi),
  theta = np.random.uniform(low = 0, high = np.pi*2)
  # The distance in meters.
  # Use the inverse cumulative polar laplacian distribution function.
  r = -np.real((lambertw((np.random.uniform(low = 0, high = 1) - 1)/np.e, k = -1) + 1)/epsilon)
  new_loc = None
  while not is_valid(new_loc):
    new_loc = distance(kilometers = r/1000).destination(point = loc, bearing = np.degrees(theta))
  return new_loc


def randomized_response(r, epsilon):
  '''
    Return the original response with probability p,
    or flip the response with probability 1 - p.
    [https://www.usenix.org/system/files/conference/usenixsecurity17/sec17-wang-tianhao.pdf]

    Parameters:
      r       - The original response (True/False).
      epsilon - The privacy budget.
    Returns:
      The randomized response.
  '''
  # Return the original value
  if random.uniform(0, 1) <= math.exp(epsilon)/(math.exp(epsilon) + 1):
    return r
  # Flip the original value
  else:
    return not r


def add_laplace_noise(value, sens, epsilon):
  '''
    Add random noise to a data value.

    Parameteres:
      value   - The value to add noise to.
      sens    - The query function sensitivity.
      epsilon - The privacy budget.
    Returns:
      The value with the noise.
  '''
  return np.random.laplace(value, sens/epsilon)


def draw_line(line, x, label, marker):
  axis_x = list(range(len(x)))
  # plt.xticks(axis_x, x)
  plt.plot(axis_x, x, label=label, marker=marker)


def eval_seq(dists):
  '''
    Calculate the standard deviation of a list of distances.

    Parameters:
      dists - A list of distances.
    Returns:
      The standard deviation of the distances.
  '''
  return np.std(dists)


def get_abs_dists(seq, comb, lmdks):
  '''
    Get the distances of the points in a combination from the start and end of the sequence.

    Parameters:
      seq - The point sequence.
      comb - A possible point combination for a specified size.
      lmdks - The landmarks.
    Returns:
      dists - The distances of the points in a combination.
  '''
  cur = np.append(comb, lmdks)
  cur.sort()
  start = seq[0]
  end = seq[len(seq) - 1]
  dists = np.zeros([len(cur)*2])
  # Check if there are any points
  if cur.size:
    for i in range(0, len(cur)):
      # From the start
      dists[i*2] = abs(cur[i] - start)
      # From the end
      dists[i*2 + 1] = abs(cur[i] - end)
  return dists


def get_combs(seq, size):
  '''
    Get all the possible landmark point combinations for a specified size.

    Parameters:
      seq - The point sequence.
      size - The desired number of landmarks.
    Returns:
      All the possible landmark combinations for a specified size.
  '''
  return itertools.combinations(seq, size)

def get_combs_h(h, hist, max):
  combs = []
  for comb in itertools.product(range(h + 1), repeat=len(hist)):
    check = True
    for i, c in enumerate(comb):
      if sum(comb) > max or c < hist[i] or c > h:
        check = False
        break
    if check:
      combs.append(comb)
  return combs


def get_emd(h1, h2):
  '''
    The earth mover's distance (EMD), or Wasserstein distance, of two histograms.
    [https://stats.stackexchange.com/a/151362]
    Pele et al., The quadratic-chi histogram distance family

    Parameters:
      h1 - A histogram.
      h2 - Another histogram.
    Return:
      The EMD of the histograms.
  '''
  return stats.wasserstein_distance(h1,h2)
  # return stats.wasserstein_distance(get_norm_hist(h1), get_norm_hist(h2))


def get_hist(seq, lmdks):
  '''
    Create a histogram from a set of landmarks.

    Parameters:
      seq - The point sequence.
      lmdks - The landmarks.
    Return:
      hist - The histogram of landmarks.
      h - The bin size.
  '''
  # # Create a landmark sequence relative to the entire sequence
  # lmdks_rel = np.array(lmdks)
  # # Add the start of the sequence if not in landmarks
  # if not np.any(lmdks_rel[:] == start):
  #   lmdks_rel = np.insert(lmdks_rel, 0, start)
  # # Add the end of the sequence if not in landmarks
  # if not np.any(lmdks_rel[:] == end):
  #   lmdks_rel = np.append(lmdks_rel, end)

  # Dealing with zeros.
  if len(seq) == 0:
    return np.zeros(math.ceil(max(seq))), 1
  elif len(lmdks) == 0:
    return np.zeros(1), len(seq)

  # Interquartile range (IQR) is a measure of statistical dispersion, being equal to the difference between 75th and 25th percentiles, or between upper and lower quartiles.
  # https://en.wikipedia.org/wiki/Interquartile_range
  iqr = stats.iqr(lmdks)

  # Use the Freedman–Diaconis rule to select the width of the bins to be used in the histogram.
  # Robust (resilient to outliers) estimator that takes into account data variability and data size.
  # https://en.wikipedia.org/wiki/Freedman%E2%80%93Diaconis_rule
  h = 2*iqr*len(lmdks)**(-1/3)
  if h < 1:
    h = 1
  # On the number of bins and width:
  # https://en.wikipedia.org/wiki/Histogram#Number_of_bins_and_width

  # Normalize the interval
  h = math.ceil(max(seq)/math.ceil(max(seq)/math.ceil(h)))

  # Create an empty histogram with intervals of size h
  hist = np.zeros(math.ceil(max(seq)/h), dtype = int)

  for lmdk in lmdks:
    hist[int(lmdk/h) - 1] += 1

  return hist, h


def get_lmdks(seq, size, dist):
  '''
    Get a landmark set for a distribution type.

    Parameters:
      seq  - The point sequence.
      size - The landmarks' size.
      dist - The distribution type code.
            -1 - Left-skewed
              0 - Symmetric
            +1 - Right-skewed
            +2 - Bimodal
            +3 - Uniform
    Return:
      hist - The histogram of landmarks.
      h - The bin size.
  '''
  scale = len(seq)/10
  if dist == -1 or dist == 0 or dist == 1:
    return MixtureModel([truncnorm(min(seq), max(seq), get_loc(seq, dist), scale)]).sample(min(seq), max(seq), [], size)
  elif dist == 2:
    return MixtureModel([
      truncnorm(min(seq), max(seq), get_loc(seq, -1), scale),
      truncnorm(min(seq), max(seq), get_loc(seq, +1), scale)
    ]).sample(min(seq), max(seq), [], size)
  else:
    return np.sort(np.random.choice(np.arange(min(seq), max(seq) + 1, 1), size=size, replace=False))


def get_lmdks_rand(seq, size, skew):
  '''
    Get a landmark set with a given skewness.

    Parameters:
      seq - The point sequence.
      size - The landmarks' size.
      skew - The desired skewness.
    Return:
      hist - The histogram of landmarks.
      h - The bin size.
  '''
  # Get all possible landmark combinations
  lmdk_combs = get_combs(seq, size)
  # Find a set of landmarks for the given requirements
  lmdks = ()
  for comb in lmdk_combs:
    skew_cur = get_skew(comb)
    # Check if skewness is close to the requirement or have the same sign
    if math.isclose(skew_cur, skew, rel_tol=.1) or ((skew_cur < 0) == (skew < 0)):
      lmdks = comb
      break
  return lmdks


def get_loc(seq, skew):
  '''
    Get the location of the sequence with a given skewness.

    Parameters:
      seq  - The point sequence.
      skew - The desired skewness.
    Return:
      The location.
  '''
  loc_min = min(seq)
  loc_max = max(seq)
  if skew < 0:
    loc_min = (max(seq) - min(seq))/2
  elif skew > 0:
    loc_max = (max(seq) - min(seq))/2
  return int(loc_min + (loc_max - loc_min)/2)


def get_mean(seq, hist, h):
  '''
    Get the mean of the histogram.

    Parameters:
      seq - The point sequence.
      hist - The histogram of landmarks.
      h - The bin size.
    Return:
      The mean of the histogram.
  '''
  sum = 0
  for idx, count in enumerate(hist):
    # Find bin limits
    start = min(seq) + h*idx
    end = min(seq) + h*(idx + 1)
    if(end > max(seq)):
      end = max(seq)

    sum += (start + (end - start)/2)*count

  return sum/np.sum(hist)


def get_norm(h1, h2):
  '''
    The Euclidean distance of two histograms.
    [https://stackoverflow.com/a/1401828/13123075]

    Parameters:
      h1 - A histogram.
      h2 - Another histogram.
    Return:
      The Euclidean distance of the histograms.
  '''
  return np.linalg.norm(h1 - h2)


def get_norm_hist(hist):
  '''
    Normalize the histogram.

    Parameters:
      hist - The original histogram.
    Return:
      The normalized histogram.
  '''
  # In-place type conversion
  norm_hist = hist.astype(np.float32)
  n = np.sum(norm_hist)
  for i, a in enumerate(norm_hist):
    if a:
      norm_hist[i] = a/n
  return norm_hist


def get_opts(seq, lmdks):
  '''
    Finds all the possible valid options.

    Parameters:
      seq - The point sequence.
      lmdks - The landmarks.
    Returns:
      A list with all the possible options.

    Requirements:
      n = Regular points
      r = The size of a combination
      Time  - O(C(n, r) + 2^C(n, r)), because O(n*C(n, r)) = O(n*n^min(r, n - r))
      Space - O(r*C(n, r))
  '''
  reg = get_reg(seq, lmdks)
  # Find all possible combinations for every k
  combs = []
  for k in range(len(lmdks) + 1, seq[len(seq) - 1] + 1):
    combs.append(get_combs(reg, k - len(lmdks)))

  # Find all possible options for all combinations
  opts = itertools.product(*combs)

  # Keep only valid options, i.e., larger sets must contain every element of every smaller set
  rslts = []
  for opt in opts:
    for i, _ in enumerate(opt):
      if i and not set(opt[i - 1]).issubset(opt[i]):
        break
    if i == len(opt) - 1:
      rslts.append(opt)

  return rslts


def get_reg(seq, lmdks):
  '''
    Get the regular points, i.e., non-landmarks, in a sequence.

    Parameters:
      seq - The point sequence.
      lmdks - The landmarks.
    Returns:
      The non-landmark points in a sequence.
  '''
  return np.array([i for i in seq if i not in lmdks])


def get_rel_dists(seq, comb, lmdks):
  '''
    Get the distances of the points in a combination from the nearest point of reference.
    That is, the previous/next point or the start/end if closer.

    Parameters:
      seq - The point sequence.
      comb - A possible point combination for a specified size.
      lmdks - The landmarks.
    Returns:
      dists - The distances of the points in a combination.
  '''
  # TODO: Review distances. Maybe take into account both ways?
  cur = np.append(comb, lmdks)
  cur.sort()
  start = seq[0]
  end = seq[len(seq) - 1]
  dists = np.zeros([len(cur) + 1])
  # Check if there are any points
  if cur.size:
    # First
    dists[0] = abs(cur[0] - start)
    # In-between
    for i in range(0, len(cur) - 1):
      dists[i + 1] = abs(cur[i] - cur[i + 1])
    # Last
    dists[len(cur)] = abs(cur[len(cur) - 1] - end)
  return dists


def get_seq(start, end):
  '''
    Get a sequence of points.

    Parameters:
      start - The starting point of the sequence.
      end - The ending point of the sequence.
    Returns:
      A point sequence.
  '''
  return np.arange(start, end + 1)


def get_shannon_ent(a):
  '''
    Get the Shannon entropy of an array.
    It can be used to calculate the uniformity of a histogram.
    Uniform probability yields maximum uncertainty and therefore maximum entropy.
    Entropy, then, can only decrease from the value associated with uniform probability.

    Parameters:
      a - An array of integers
    Return:
      The Shannon entropy.
  '''
  # Keep only the non-zero elements of array a because log2(0) is -inf.
  # The entropy of an event with zero probability is 0.
  a = a[a != 0]
  # Histograms are discrete probability distributions.
  # We convert the counts to probabilities.
  p_a = a/np.sum(a)
  # Base 2 gives the unit of bits (or 'shannons').
  return -np.sum(p_a*np.log2(p_a))


def get_shannon_ent_norm(a):
  '''
    Get the normalized Shannon entropy of an array.
    It ranges from 0 (high entropy - close to uniform) to 1 (low entropy).
  It ranges from 0 (high entropy - close to uniform) to 1 (low entropy).
    It ranges from 0 (high entropy - close to uniform) to 1 (low entropy).

    Parameters:
      a - An array of integers
    Return:
      The normalized Shannon entropy.
  '''
  return 1 - get_shannon_ent(a)/np.log2(len(a))


def get_skew(lmdks):
  '''
    Fisher-Pearson coefficient of skewness.
    negative - left (more data items right)
    zero     - symmetric data
    positive - right (more data items left)
  positive - right (more data items left)
    positive - right (more data items left)

    Parameters:
      seq - The point sequence.
      hist - The histogram of landmarks.
      h - The bin size.
    Return:
      Fisher-Pearson coefficient of skewness.
  '''
# def get_skew(seq, hist, h):
#   return get_third(seq, hist, h)/get_std(seq, hist, h)**3
  return stats.skew(lmdks)


def get_std(seq, hist, h):
  '''
    Get the standard deviation.

    Parameters:
      seq - The point sequence.
      hist - The histogram of landmarks.
      h - The bin size.
    Return:
      The standard deviation.
  '''
  hist_mean = get_mean(seq, hist, h)

  sum = 0
  for idx, count in enumerate(hist):
    # Find bin limits
    start = min(seq) + h*idx
    end = min(seq) + h*(idx + 1)
    if(end > max(seq)):
      end = max(seq)

    sum += (((start + (end - start)/2) - hist_mean)**2)*count

  return sum/np.sum(hist)


def get_third(seq, hist, h):
  '''
    Get the third central moment.

    Parameters:
      seq - The point sequence.
      hist - The histogram of landmarks.
      h - The bin size.
    Return:
      The third central moment.
  '''
  hist_mean = get_mean(seq, hist, h)

  sum = 0
  for idx, count in enumerate(hist):
    # Find bin limits
    start = min(seq) + h*idx
    end = min(seq) + h*(idx + 1)
    if(end > max(seq)):
      end = max(seq)

    sum += (((start + (end - start)/2) - hist_mean)**3)*count

  return sum/np.sum(hist)


def dist_type_to_str(d):
  '''
    Convert the distribution type code to string.

    Parameters:
      t - The distribution type code.
          -1 - Left-skewed
          0 - Symmetric
          +1 - Right-skewed
          +2 - Bimodal
          +3 - Uniform
    Return:
      The distribution type.
  '''
  if d == -1:
    return 'Left-skewed'
  elif d == 0:
    return 'Symmetric'
  elif d == 1:
    return 'Right-skewed'
  elif d == 2:
    return 'Bimodal'
  elif d == 3:
    return 'Uniform'
  else:
    return 'Undefined'


class MixtureModel(stats.rv_continuous):
  '''
    A mixture model of continuous probability distributions
    [https://stackoverflow.com/a/47763145/13123075]
  '''
  def __init__(self, submodels, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.submodels = submodels

  def _pdf(self, x):
    pdf = self.submodels[0].pdf(x)
    for submodel in self.submodels[1:]:
      pdf += submodel.pdf(x)
    pdf /= len(self.submodels)
    return pdf

  def rvs(self, size):
    submodel_choices = np.random.randint(len(self.submodels), size=size)
    submodel_samples = [submodel.rvs(size=size) for submodel in self.submodels]
    rvs = np.choose(submodel_choices, submodel_samples)
    return rvs

  def sample(self, lower, upper, sample, size):
    '''
      Sample from the mixture model without replacement.
      [https://stackoverflow.com/a/20548895/13123075]

      Parameters:
        self   - The mixture model.
        lower  - The lower bound of the range.
        upper  - The upper bound of the range.
        sample - Items to be excluded.
        size   - The sample size.
      Returns:
        A sample.
    '''
    w = self.pdf(range(int(lower), int(upper) + 1))
    idx = []
    for i in range(int(lower), int(upper) + 1):
      if i not in sample:
        idx.append(i)
    elt = [(math.log(random.random()) / w[i - 1], i) for i in idx]
    return np.sort([x[1] for x in heapq.nlargest(size, elt)])


def plot_init():
  '''
    Initialize the plot.
  '''
  # Reset
  plt.close('all')
  # Style
  plt.style.use('classic')
  # DPI
  plt.figure(dpi=dpi)
  # Font
  plt.rc('font', family='sans-serif')
  plt.rc('font', **{'sans-serif':['Liberation Sans']})
  plt.rc('font', size=font_size)
  # Deal with Type 3 fonts
  plt.rcParams['pdf.fonttype'] = 42
  plt.rcParams['ps.fonttype'] = 42
  # Grid
  plt.setp(plt.figure().add_subplot(111).spines.values(), linewidth=line_width)
  plt.grid(True, axis='y', linewidth=line_width)
  # Ticks
  plt.gca().tick_params(which='both', width=line_width)
  plt.gca().tick_params(which='major', length=tick_length)
  plt.gca().tick_params(which='minor', length=tick_length/2)
  # Colors
  plt.subplot(111).set_prop_cycle('color', ['#212121', '#616161', '#9e9e9e', '#bdbdbd', '#e0e0e0', 'f5f5f5'])
  # Layout
  plt.tight_layout()


def plot_legend():
  '''
    Initialize the plot legend.
  '''
  plt.legend(
    loc='best',
    fontsize=font_size,
    numpoints=1,
    borderpad=.2,      # default: 0.4
    labelspacing=.25,  # default: 0.5
    handlelength=2.0,  # default: 2.0
    handletextpad=.4,  # default: 0.8
    borderaxespad=.5,  # default: 0.5
  )


def save_plot(path):
  '''
    Save the plot to a file.

    Parameters:
      path - The desired location.
    Returns:
      Nothing.
  '''
  # Save plot
  os.makedirs(os.path.dirname(path), exist_ok=True)
  plt.savefig(path, bbox_inches='tight')
  # Clean up
  plt.clf()
  plt.close()


def print_lmdks(seq, lmdks):
  '''
    Print the landmarks.

    Parameters:
      seq - The point sequence.
      lmdks - The landmarks.
    Returns:
      Nothing.
  '''
  print('\n######### Landmarks ##########')
  print(lmdks)
  print('[', end='', flush=True)
  for p in seq:
    if np.any(lmdks[:] == p):
      print('\'', end='', flush=True)
    else:
      print('.', end='', flush=True)
  print(']', end='', flush=True)
  print('\n##############################\n')


def plot_dist(seq, dist):
  '''
    Plot the probability distribution.

    Parameters:
      seq  - The point sequence.
      size - The landmarks' size.
      dist - The distribution type code.
            -1 - Left-skewed
              0 - Symmetric
            +1 - Right-skewed
            +2 - Bimodal
            +3 - Uniform
    Returns:
      Nothing.
  '''
  scale = len(seq)/10
  p = stats.uniform.pdf(seq)
  if dist == -1 or dist == 0 or dist == 1:
    p = MixtureModel([truncnorm(min(seq), max(seq), get_loc(seq, dist), scale)]).pdf(seq)
  elif dist == 2:
    p = MixtureModel([
      truncnorm(min(seq), max(seq), get_loc(seq, -1), scale),
      truncnorm(min(seq), max(seq), get_loc(seq, +1), scale)
    ]).pdf(seq)
  plt.plot(seq, p)
  plt.show()


def simplify_data(seq, lmdks):
  '''
    Get synthetic from real data.

    Parameters:
      seq   - The trajectory.
      lmdks - The landmarks.
    Returns:
      The simplified sequence and landmarks.
  '''
  seq_s = get_seq(1, len(seq))
  lmdks_s = []
  for i, p in enumerate(seq):
    if is_landmark(p, lmdks):
      lmdks_s.append(i + 1)
  return seq_s, np.array(lmdks_s)


def truncnorm(lower, upper, loc, scale):
  '''
    A truncated normal continuous random variable.

    Parameters:
      lower - The lower bound of the range.
      upper - The upper bound of the range.
      loc - The location of the distribution.
      scale - The spread of the distribution.
    Returns:
      A truncated normal continuous random variable.
  '''
  return stats.truncnorm((lower - loc)/scale, (upper - loc)/scale, loc, scale)


def is_valid(coord):
  '''
    Check if coordinates are valid.

    Parameters:
      coord - The coordinates to be checked.
    Returns:
      True  - Valid coordinates.
      False - Invalid coordinates.
  '''
  if coord is None or (float(coord[0]) > 90 or float(coord[0]) < -90  # lat
  or float(coord[1]) > 180 or float(coord[1]) < -180):  #lng
    return False
  return True


def save_data(args, data, name):
  '''
    Save to file and compress.

    Parameters:
      args - The execution arguments.
      data - The data.
      name - The name.
    Returns:
      Nothing.
  '''
  try:
    path = os.path.dirname(os.path.abspath(args.res)) + '/' + name
    if 'lmdks' in name:
      path += '(dis=' + str(args.dist) + ', per=' + str(args.per) + ')'
    print('Saving to %s... ' %(path), end='', flush=True)
    with open(path, 'wb') as file:
      _, idx = np.unique(data, axis=0, return_index=True)
      np.save(file, data[np.sort(idx)], allow_pickle=False)
    print('[OK]')
    # Compress file
    print('Compressing in %s... ' %(args.res), end='', flush=True)
    with zipfile.ZipFile(args.res, 'a') as zip_file:
      zip_file.write(path, os.path.basename(path))
    print('[OK]')
    # Delete file
    print('Deleting %s... ' %(path), end='', flush=True)
    os.remove(path)
    print('[OK]')
  except Exception as e:
    print('[Error: %s]' %(e))


def load_data(args, name):
  '''
    Load data from file.

    Parameters:
      args - The execution arguments.
      data - The data.
      name - The name.
    Returns:
      data - The data set.
             0: uid, 1: lat, 2: lng, 3: tim
  '''
  data = np.array([])
  try:
    path = os.path.dirname(os.path.abspath(args.res)) + '/' + name
    if 'lmdks' in name:
      path += '(dis=' + str(args.dist) + ', per=' + str(args.per) + ')'
    print('Loading %s... ' %(path), end='', flush=True)
    with zipfile.ZipFile(args.res, 'r') as zip_file:
      data = np.load(zip_file.open(os.path.basename(path)))
      print('[OK]')
  except Exception as e:
    print('[Error: %s]' %(e))
  return data


def find_lmdks(usrs_data, args):
  '''
    Find users' landmarks.

    Parameters:
      args      - The execution arguments.
      usrs_data - The users' data.
                  0: uid, 1: lat, 2: lng, 3: tim
    Returns:
      usrs_lmdks - The users' landmarks.
                  0: uid, 1: lat, 2: lng, 3: tim
  '''
  usrs_lmdks = np.empty((0,4), np.float32)
  traj_cur = 0
  usrs = np.unique(usrs_data[:,0])
  for usr_i, usr in enumerate(usrs):
    # Initialize user's landmarks list
    lmdks = []
    traj = usrs_data[usrs_data[:,0]==usr, :]
    traj_cur += len(traj)
    print(
      '[%d%%] Points: %d/%d | Users: %d/%d... '
      %((traj_cur/usrs_data.shape[0])*100, traj_cur, usrs_data.shape[0], usr, usrs[len(usrs) - 1]),
      end='', flush=True
    )
    # Check the actual points
    i = 0
    while i < len(traj) - 1:
      lmdk_cur = []
      if is_valid((traj[i][1], traj[i][2])):
        for j in range(i + 1, len(traj)):
          if is_valid((traj[j][1], traj[j][2])):
            # Add the beginning only the first time
            if j == i + 1:
              lmdk_cur.append([usr, traj[i][1], traj[i][2], traj[i][3]])
            # Add the new point
            lmdk_cur.append([usr, traj[j][1], traj[j][2], traj[j][3]])
            # Distance in meters
            dist = distance((traj[i][1], traj[i][2]), (traj[j][1], traj[j][2])).km*1000
            # Distance exceeded or reached end of iteration
            if dist > args.dist or j == len(traj) - 1:
              # Period in minutes
              per = abs(datetime.fromtimestamp(int(traj[i][3])) - datetime.fromtimestamp(int(traj[j][3]))).total_seconds()/60
              # Check if enough time passed
              if per > args.per:
                # Append current landmark
                lmdks += lmdk_cur
              # Continue checking from the current point
              i = j
              break
      # No landmark was found, continue from next point
      if i == 0 or not is_valid((traj[i][1], traj[i][2])) or 'j' not in vars() or i != j:
        i += 1
    print('[OK]')
    if lmdks:
      usrs_lmdks = np.append(usrs_lmdks, np.asarray(lmdks, dtype=np.float32), axis=0)
  return usrs_lmdks


def find_lmdks_seq(seq, lmdks):
  lmdks_seq = []
  for i, p in enumerate(seq):
    if is_landmark(p, lmdks):
      lmdks_seq.append(i + 1)
  return np.array(lmdks_seq, dtype = int)


def find_lmdks_tim(lmdk_data, seq, uid, pct):
  '''
    Find user's landmarks timestamps.

    Parameters:
      lmdk_data - The landmarks contacts for all users per
                  landmarks percentage.
                  0: uid, 1: lmdk_pct, 2: contacts
      seq       - The users' data.
                  0: uid, 1: lmdk_pct, 2: contacts
      uid       - The user's id that we are interested in.
      pct       - The landmarks percentage.
    Returns:
      lmdks - The user's landmarks timestamps for the given
              landmarks percentage.
              0: tim, 1: uid_a, 2: uid_b, 3: rssi
  '''
  # Initialize user's landmarks
  lmdks = np.empty(0)
  # All the sequence
  if pct == 100:
    # Get all timestamps
    return seq[:, 1]
  # Find for given percentage
  elif pct != 0:
    # All user's landmark contacts for all landmarks percentages
    usr_lmdks = lmdk_data[lmdk_data[:, 0] == uid]
    # User's landmark contacts for given percentage
    pct_lmdks = ast.literal_eval(usr_lmdks[usr_lmdks[:, 1] == str(pct/100)][0][2])
    # Find landmarks timestamps
    for u in pct_lmdks:
      lmdks = np.concatenate((lmdks, np.array(seq[seq[:, 2] == float(u)])[:, 0]))
  return lmdks


def find_lmdks_cont(lmdk_data, seq, uid, pct):
  '''
    Find user's landmarks contacts.

    Parameters:
      lmdk_data - The landmarks contacts for all users per
                  landmarks percentage.
                  0: uid, 1: lmdk_pct, 2: contacts
      seq       - The users' data.
                  0: tim, 1: uid, 2: cont, 3: rssi
      uid       - The user's id that we are interested in.
      pct       - The landmarks percentage.
    Returns:
      lmdks - The user's landmarks contacts for the given
              landmarks percentage.
              0: tim, 1: uid, 2: cont, 3: rssi
  '''
  # Initialize user's landmarks
  lmdks = np.empty(0).reshape(0,4)
  # All the sequence
  if pct == 100:
    # Get all timestamps
    return seq
  # Find for given percentage
  elif pct != 0:
    # All user's landmark contacts for all landmarks percentages
    usr_lmdks = lmdk_data[lmdk_data[:, 0] == uid]
    conts = ast.literal_eval(usr_lmdks[usr_lmdks[:, 1] == str(pct/100)][0][2])
    # Actual landmarks
    for c in conts:
      lmdks = np.vstack([lmdks, seq[seq[:, 2] == c]])
  return lmdks


def lmdks_stats(args, usrs_lmdks):
  '''
    Generate landmarks' stats.

    Parameters:
      args       - The execution arguments.
      usrs_lmdks - The user' landmarks.
                  0: lid, 1: uid, 2: lat, 3: lng, 4: tim
    Returns:
      Nothing.
  '''
  lmdks_stats = {
    'total': 0,
    'len': 0,
    'count': {},
  }
  # Check every user
  usrs = np.unique(usrs_lmdks[:, 1])
  for usr in usrs:
    print(
      '[%d%%] Calculating landmark stats for user %d/%d... '
      %(usr*100/len(usrs), usr, np.max(usrs)),
      end='', flush=True
    )
    # Check each user's landmarks
    for lmdk in np.unique(usrs_lmdks[usrs_lmdks[:, 1] == usr, 0]):
      lmdk_traj = usrs_lmdks[usrs_lmdks[:, 0] == lmdk, :]
      lmdks_stats['total'] += 1
      lmdks_stats['len'] += len(lmdk_traj)
      if lmdks_stats['count'].get(len(lmdk_traj)) == None:
        lmdks_stats['count'][len(lmdk_traj)] = 1
      else:
        lmdks_stats['count'][len(lmdk_traj)] += 1
    print('[OK]')
  hist_min, hist_max = min(lmdks_stats['count'], key=int, default=0), max(lmdks_stats['count'], key=int,  default=0)
  # Make histogram
  x = []
  for i in range(hist_min, hist_max + 1):
    if lmdks_stats['count'].get(i) != None:
      x.extend([i]*lmdks_stats['count'].get(i))
  # Show stats
  print(
    '\n############ Stats ###########\n'
    'Landmarks   : %d\n'
    '  Length\n'
    '    Total   : %d (%.2f%%)\n'
    '    Minimum : %d\n'
    '    Maximum : %d\n'
    '##############################\n'
    %(lmdks_stats['total'], lmdks_stats['len'], (lmdks_stats['len']/args.time)*100, min(lmdks_stats['count'], key=int, default=0), max(lmdks_stats['count'], key=int, default=0))
  )
  # # Initialize plot
  # plot_init()
  # # Set x axis
  # plt.xlabel('Landmarks sequence length')
  # plt.xticks(rotation='vertical')
  # # The y axis
  # plt.ylabel('Number of sequences')
  # plt.yscale('log')
  # Create histogram
  # plt.hist(x, bins=(hist_max - hist_min))
  # Show plot
  # plt.show()
  # Save plot
  # save_plot(os.path.dirname(args.arc) + '/' + 'results' + '(dis=' + str(args.dist) + ', per=' + str(args.per) + ').pdf')


def should_sample(samp_rt):
  '''
    Randomly decide to release with noise.

    Parameters:
      samp_rt - The sampling rate (0, 1]
    Returns:
      True/False
  '''
  return random.choices(population=[True, False], weights=[samp_rt, 1 - samp_rt], k=1)[0]


def is_landmark(p, lmdks):
  '''
    Check is a point is a landmark.

    Parameters:
      p     - The point to check.
      lmdks - A 2d array of landmarks.
    Returns:
      True/False
  '''
  if len(lmdks) > 0 and any(np.equal(lmdks, p).all(1)):
    return True
  return False


if __name__ == '__main__':
  try:
    seq = [1, 2, 3, 4, 5, 6, 7, 8]
    lmdk = [1, 3, 5, 8]
    print(get_hist(seq, lmdk))
    start_time = time.process_time()
    # main(args)
    end_time = time.process_time()
    print('##############################\n'
          'Time       : %.4fs\n'
          '##############################\n'
          %(end_time - start_time), flush=True)
  except KeyboardInterrupt:
    print('Interrupted by user.')
    exit()