#!/usr/bin/env python3
|
|
|
|
from datetime import datetime
|
|
from geopy.distance import distance
|
|
import math
|
|
import numpy as np
|
|
import random
|
|
import sympy as sp
|
|
import time
|
|
import lmdk_lib
|
|
from matplotlib import pyplot as plt
|
|
|
|
|
|
def draw_bgts(title, bgts):
    '''
    Plots the allocated budget.

    Parameters:
        title - The title of the plot.
        bgts  - The allocated privacy budget.
    Returns:
        Nothing.
    '''
    lmdk_lib.plot_init()
    # One tick per timestamp, starting at 1.
    timestamps = np.arange(1, len(bgts) + 1, 1)
    plt.plot(
        timestamps,
        bgts,
        marker='s',
        label=r'$\varepsilon$',
        linewidth=lmdk_lib.line_width,
        markersize=lmdk_lib.marker_size,
        markeredgewidth=0
    )
    lmdk_lib.plot_legend()
    # Fit the plot box around the data, leaving 50% headroom above the peak.
    peak = max(bgts)
    plt.axis([1, len(bgts), .0, peak + peak/2])
    # Axis labels
    plt.xlabel('Timestamp')
    plt.ylabel('Privacy budget')
    plt.title(title.replace('_', '-'))
    plt.show()
|
|
|
|
|
|
def validate_bgts(seq, lmdks, epsilon, bgts):
    '''
    Budget allocation validation.

    Parameters:
        seq     - The point sequence.
        lmdks   - The landmarks.
        epsilon - The available privacy budget.
        bgts    - The privacy budget allocation.
    Returns:
        The index of the privacy budget that caused the failure or -1 if successful.
    '''
    # Flag each point of the sequence as landmark / regular.
    lmdk_flags = [bool(np.any(lmdks[:] == p)) for p in seq]
    # Every release pays at least the total budget spent on landmarks.
    lmdk_total = .0
    for idx, is_lmdk in enumerate(lmdk_flags):
        if is_lmdk:
            lmdk_total += bgts[idx]
    worst = lmdk_total
    # Each regular event adds its own budget on top of the landmark total;
    # the allocation fails as soon as one such combination exceeds epsilon.
    for idx, is_lmdk in enumerate(lmdk_flags):
        if is_lmdk:
            continue
        total_here = lmdk_total + bgts[idx]
        if total_here > worst:
            worst = total_here
        if total_here > epsilon and not math.isclose(total_here, epsilon, rel_tol=.001):
            print('Budget allocation validation failed: %.2f%% (%.4f/%.4f)' %(100*worst/epsilon, worst, epsilon))
            return idx
    print(
        'Landmark budget allocation: %.2f%% (%.4f/%.4f)\n'
        'Overall budget allocation : %.2f%% (%.4f/%.4f)\n'
        %(100*worst/epsilon, worst, epsilon,
        100*sum(bgts)/epsilon, sum(bgts), epsilon,))
    return -1
|
|
|
|
|
|
def uniform(seq, lmdks, epsilon):
    '''
    Uniform budget allocation.

    Parameters:
        seq     - The point sequence.
        lmdks   - The landmarks.
        epsilon - The available privacy budget.
    Returns:
        bgts - The privacy budget allocation.
    '''
    # Empty sequence: nothing to allocate (also avoids a division by zero
    # when both seq and lmdks are empty).
    if len(seq) == 0:
        return np.zeros(0)
    # Reserve enough budget for all landmarks plus one regular point, so any
    # single regular release combined with every landmark stays within epsilon.
    k = len(lmdks) + 1
    # All points are landmarks: no extra regular slot is needed.
    if k > len(seq):
        k = len(lmdks)
    # Spread the budget evenly over every timestamp (vectorized fill instead
    # of the element-by-element loop).
    return np.full(len(seq), epsilon/k)
|
|
|
|
|
|
def geometric(seq, lmdks, epsilon):
    '''
    Geometric budget allocation.

    Parameters:
        seq     - The point sequence.
        lmdks   - The landmarks.
        epsilon - The available privacy budget.
    Returns:
        bgts - The privacy budget distribution.
    '''
    out = np.zeros([len(seq)])
    # Flag each point as landmark / regular once, up front.
    is_lmdk = [bool(np.any(lmdks[:] == point)) for point in seq]
    # Every landmark claims half of whatever budget is still available.
    remaining = epsilon
    for idx, flag in enumerate(is_lmdk):
        if flag:
            out[idx] = remaining/2
            remaining = remaining - remaining/2
    # Every regular event receives the full leftover budget.
    for idx, flag in enumerate(is_lmdk):
        if not flag:
            out[idx] = remaining
    return out
|
|
|
|
|
|
def exponential(seq, lmdks, epsilon):
    '''
    Exponential uniform budget allocation (max to user-level).

    Allocates a budget that decays exponentially over time,
    epsilon_t = N*e^(-l*t), scaled so that the final timestamp T
    receives the user-level share epsilon/len(seq).

    Parameters:
        seq - The point sequence (assumed a 1-D numpy array of timestamps
              here, since seq.max() and elementwise comparison are used —
              TODO confirm against callers).
        lmdks - The landmarks.
        epsilon - The available privacy budget.

    Returns:
        bgts - The privacy budget allocation (all zeros if the symbolic
               solver fails).
    '''
    # Fallback to uniform if zero or max landmarks
    if len(lmdks) == 0 or len(lmdks) == len(seq):
        return uniform(seq, lmdks, epsilon)
    # # In case of seq == lmdks
    # l = 0
    # N = epsilon/len(seq)
    # Otherwise
    bgts = np.zeros([len(seq)])
    # NOTE(review): after the guard above, len(seq) != len(lmdks) always
    # holds, so this condition is always true and `points` is always defined.
    if len(seq) != len(lmdks):
        # Find the worst-case regular point (the first non-landmark)
        p_sel = 0
        for p in seq:
            if not np.any(lmdks[:] == p):
                p_sel = p
                break
        # List all landmark timestamps together with the worst regular one
        points = np.append(lmdks, [p_sel])
        points = np.sort(points)
    # Budget model: epsilon_t = N*e^(-l*t); solve for the decay rate l
    l = sp.symbols('l', real=True)
    # Cumulative allocation at landmarks and one extra point, i.e., L union {t}
    T = seq.max()
    # Bounding the privacy budgets at L union {t}:
    # epsilon = sum(N*e^(-l*t)) for t in L union {t}
    # NOTE(review): this symbol is immediately shadowed by the loop
    # variable below and never used symbolically.
    t = sp.symbols('t')
    eq = 0
    for t in points:
        # N is expressed as (epsilon/len(seq))/e^(-l*T) so that the budget
        # at the last timestamp T equals the user-level share epsilon/len(seq).
        eq += (((epsilon/len(seq))/(sp.exp(-1*l*T)))*sp.exp(-1*l*t))
    try:
        l = sp.solve(eq - epsilon, simplify=False, rational=False)[0]
        # l = sp.solveset(eq <= epsilon, l, sp.S.Reals).sup
    except Exception:
        # Solver failed: return the all-zero allocation.
        return bgts
    # Allocate to the last point T epsilon/len(seq), i.e., user-level
    N = (epsilon/len(seq))/sp.exp(-1*l*T)
    # Evaluate the decay curve at every timestamp
    for i, t in enumerate(seq):
        bgts[i] = N*sp.exp(-1*l*t)
    return bgts
|
|
|
|
|
|
def linear_zero(seq, lmdks, epsilon):
    '''
    Linear uniform budget allocation (max to zero).

    Allocates a linearly decreasing budget, epsilon_t = a*t + b, whose
    value reaches zero at the last timestamp T (a = -b/T).

    Parameters:
        seq - The point sequence (assumed a 1-D numpy array of timestamps —
              TODO confirm against callers).
        lmdks - The landmarks.
        epsilon - The available privacy budget.

    Returns:
        bgts - The privacy budget allocation.
    '''
    bgts = np.zeros([len(seq)])
    # Find the worst-case regular point (the first non-landmark)
    p_sel = 0
    for p in seq:
        if not np.any(lmdks[:] == p):
            p_sel = p
            break
    # Sum all landmark timestamps together with the worst regular one
    sum_cur = p_sel
    for p in lmdks:
        sum_cur += p
    # Budget model: epsilon_t = a*t + b; solve for the intercept b
    b = sp.symbols('b')
    # Last timestamp: the line hits zero here
    T = seq[len(seq) - 1]
    # Bound the cumulative budget over landmarks plus one extra point:
    # with a = -b/T, require b = ((b/T)*sum_cur + epsilon)/(len(lmdks) + 1)
    b = sp.solve(b - ((b/T)*sum_cur + epsilon)/(len(lmdks) + 1))[0]
    # Slope chosen so the allocation at the last point T is zero
    a = -b/T
    # Evaluate the line at every timestamp
    for i, t in enumerate(seq):
        bgts[i] = a*t + b
    return bgts
|
|
|
|
|
|
def linear(seq, lmdks, epsilon):
    '''
    Linear uniform budget allocation (max to user-level).

    Allocates a linearly decreasing budget, epsilon_t = a*t + b, whose
    value at the last timestamp T equals the user-level share
    epsilon/len(seq).

    Parameters:
        seq - The point sequence (assumed a 1-D numpy array of timestamps
              here, since seq.max() is used — TODO confirm against callers).
        lmdks - The landmarks.
        epsilon - The available privacy budget.

    Returns:
        bgts - The privacy budget allocation.
    '''
    # Fallback to uniform if zero or max landmarks
    if len(lmdks) == 0 or len(lmdks) == len(seq):
        return uniform(seq, lmdks, epsilon)
    # Find the worst-case regular point (the first non-landmark)
    p_sel = 0
    for p in seq:
        if not np.any(lmdks[:] == p):
            p_sel = p
            break
    # Sum all landmark timestamps together with the worst regular one
    sum_cur = p_sel + np.sum(lmdks)
    # Budget model: epsilon_t = a*t + b; solve for the intercept b
    b = sp.symbols('b', real=True)
    # Cumulative allocation at landmarks and one extra point, i.e., L union {t}
    T = seq.max()
    # Number of points to take into account: |L union {t}|
    k = len(lmdks) + 1
    # if len(lmdks) == len(seq):
    #     k = len(lmdks)
    # Bounding the privacy budgets at L union {t}:
    # epsilon = a*sum(L union {t}) + |L union {t}|*b, with the slope a
    # constrained so that epsilon_T = epsilon/len(seq) (user-level).
    b = sp.solve(b + (((epsilon/len(seq) - b)/T)*sum_cur - epsilon)/k, simplify=False, rational=False)[0]
    # Allocate to the last point T epsilon/len(seq), i.e., user-level
    a = (epsilon/len(seq) - b)/T
    # Evaluate the line at every timestamp
    bgts = np.zeros([len(seq)])
    for i, t in enumerate(seq):
        bgts[i] = a*t + b
    return bgts
|
|
|
|
|
|
def plot_line(x_i, arr, label, marker, line):
    '''Plot one series in the house style: black line, hollow markers.'''
    style = dict(
        label=label,
        color='black',
        marker=marker,
        linestyle=line,
        # linewidth=1.0,
        markerfacecolor='none',
    )
    plt.plot(x_i, arr, **style)
|
|
|
|
|
|
def stepped(seq, lmdks, epsilon):
    '''
    Stepped budget allocation.

    Parameters:
        seq     - The point sequence.
        lmdks   - The landmarks.
        epsilon - The available privacy budget.
    Returns:
        bgts - The privacy budget allocation.
    '''
    # Start at half the total budget and halve the per-release level
    # every time a landmark is encountered.
    level = epsilon/2
    levels = []
    for point in seq:
        if np.any(lmdks[:] == point):
            level = level/2
        levels.append(level)
    return np.array(levels)
|
|
|
|
|
|
def adaptive(seq, lmdks, epsilon, inc_rt, dec_rt):
    '''
    Adaptive budget allocation.

    Publishes with a sampling rate that adapts to how informative each
    release is: releases similar to the previous one lower the rate,
    dissimilar ones raise it. Skipped landmark budget is redistributed
    to later releases.

    Parameters:
        seq - The point sequence. Points are indexed as p[0]..p[3];
              p[1], p[2] are latitude/longitude — assumes 4-column rows,
              TODO confirm against callers.
        lmdks - The landmarks.
        epsilon - The available privacy budget.
        inc_rt - Sampling rate increase rate.
        dec_rt - Sampling rate decrease rate.

    Returns:
        rls_data - The perturbed data.
        bgts - The privacy budget allocation.
        skipped - The number of skipped releases.
    '''
    # Uniform budget allocation as the starting point
    bgts = uniform(seq, lmdks, epsilon)
    # Released
    rls_data = [None]*len(seq)
    # The sampling rate (start by always publishing)
    samp_rt = 1
    # Track how many landmarks have been seen so far
    lmdk_cur = 0
    # Track skipped releases
    skipped = 0
    for i, p in enumerate(seq):
        # Check if current point is a landmark
        if lmdk_lib.is_landmark(p, lmdks):
            lmdk_cur += 1
        # Get coordinates
        loc = (p[1], p[2])
        # The first point is always published
        if lmdk_lib.should_sample(samp_rt) or i == 0:
            # Add noise to original data
            new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
            rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
            # Adjust sampling rate based on the distance to the previous
            # release: 1/bgts[i] acts as the noise-scale threshold (meters)
            if i > 0:
                if distance((rls_data[i - 1][1], rls_data[i - 1][2]), new_loc).km*1000 < 1/bgts[i]:
                    # Release close to the previous one: decrease the rate
                    samp_rt -= samp_rt*dec_rt
                else:
                    # Release far from the previous one: increase the rate
                    samp_rt += (1 - samp_rt)*inc_rt
        else:
            skipped += 1
            # Skip current release and approximate with previous
            rls_data[i] = rls_data[i - 1]
            if lmdk_lib.is_landmark(p, lmdks):
                # Allocate the current budget to the following releases
                # uniformly (spread over the remaining landmarks + 1)
                for j in range(i + 1, len(seq)):
                    bgts[j] += bgts[i]/(len(lmdks) - lmdk_cur + 1)
                # No budget was spent
                bgts[i] = 0
    return rls_data, bgts, skipped
|
|
|
|
|
|
def skip(seq, lmdks, epsilon):
    '''
    Skip landmarks.

    Publishes every regular point with event-level budget and approximates
    every landmark (except one at the very first timestamp) with the
    previously released location, spending no budget on it.

    Parameters:
        seq - The point sequence. Points are indexed as p[0]..p[3];
              p[1], p[2] are latitude/longitude — assumes 4-column rows,
              TODO confirm against callers.
        lmdks - The landmarks.
        epsilon - The available privacy budget.

    Returns:
        rls_data - The perturbed data.
        bgts - The privacy budget allocation.
    '''
    # Event-level budget allocation: every release may spend the full budget
    bgts = np.array(len(seq)*[epsilon])
    # Released
    rls_data = [None]*len(seq)
    for i, p in enumerate(seq):
        if lmdk_lib.is_landmark(p, lmdks) and i > 0:
            # Approximate a landmark with the previously released location;
            # no budget is spent — and, unlike before, no noise is drawn
            # just to be thrown away.
            new_loc = (rls_data[i - 1][1], rls_data[i - 1][2])
            bgts[i] = 0
        else:
            # Regular point (or a landmark at the very first timestamp):
            # add noise to the original coordinates.
            new_loc = lmdk_lib.add_polar_noise((p[1], p[2]), bgts[i])
        rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
    return rls_data, bgts
|
|
|
|
|
|
def sample(seq, lmdks, epsilon):
    '''
    Publish randomly.

    Publishes each point with a fixed 50% probability; skipped points are
    approximated with the previous release, and skipped landmark budget is
    redistributed to later releases.

    Parameters:
        seq - The point sequence. Points are indexed as p[0]..p[3];
              p[1], p[2] are latitude/longitude — assumes 4-column rows,
              TODO confirm against callers.
        lmdks - The landmarks.
        epsilon - The available privacy budget.

    Returns:
        rls_data - The perturbed data.
        bgts - The privacy budget allocation.
        skipped - The number of skipped releases.
    '''
    # Uniform budget allocation as the starting point
    bgts = uniform(seq, lmdks, epsilon)
    # Released
    rls_data = [None]*len(seq)
    # The sampling rate
    # (publish with 50% chance)
    samp_rt = .5
    # Track how many landmarks have been seen so far
    lmdk_cur = 0
    # Track skipped releases
    skipped = 0
    for i, p in enumerate(seq):
        # Check if current point is a landmark
        if lmdk_lib.is_landmark(p, lmdks):
            lmdk_cur += 1
        # Get coordinates
        loc = (p[1], p[2])
        # The first point is always published
        if i == 0 or lmdk_lib.should_sample(samp_rt):
            # Add noise to original data
            new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
            rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
        else:
            skipped += 1
            # Skip current release and approximate with previous
            rls_data[i] = rls_data[i - 1]
            if lmdk_lib.is_landmark(p, lmdks):
                # Allocate the current budget to the following releases
                # uniformly (spread over the remaining landmarks + 1)
                for j in range(i + 1, len(seq)):
                    bgts[j] += bgts[i]/(len(lmdks) - lmdk_cur + 1)
                # No budget was spent
                bgts[i] = 0
    return rls_data, bgts, skipped
|
|
|
|
|
|
def discount(seq, lmdks, epsilon):
    '''
    Temporally discounted sampling.

    Like sample(), but the sampling rate is adjusted at each landmark in
    proportion to how recent that landmark is relative to the last one.

    Parameters:
        seq - The point sequence. Points are indexed as p[0]..p[3];
              p[1], p[2] are latitude/longitude — assumes 4-column rows,
              TODO confirm against callers.
        lmdks - The landmarks. Column 3 is used as the discounting key
              (presumably a timestamp) — verify against callers.
        epsilon - The available privacy budget.

    Returns:
        rls_data - The perturbed data.
        bgts - The privacy budget allocation.
        skipped - The number of skipped releases.
    '''
    # Uniform budget allocation as the starting point
    bgts = uniform(seq, lmdks, epsilon)
    # Released
    rls_data = [None]*len(seq)
    # Track how many landmarks have been seen so far
    lmdk_cur = 0
    # Track skipped releases
    skipped = 0
    # Initialize the sampling rate (publish with 50% chance)
    samp_rt = .5
    for i, p in enumerate(seq):
        # Check if current point is a landmark
        if lmdk_lib.is_landmark(p, lmdks):
            lmdk_cur += 1
            # Adjust the sampling rate: ratio of the current landmark's
            # column-3 value to the last landmark's (approaches 1 over time)
            if len(lmdks) > 1:
                samp_rt = lmdks[lmdk_cur - 1][3] / lmdks[len(lmdks) - 1][3]
        # Get coordinates
        loc = (p[1], p[2])
        # The first point is always published
        if i == 0 or lmdk_lib.should_sample(samp_rt):
            # Add noise to original data
            new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
            rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
        else:
            skipped += 1
            # Skip current release and approximate with previous
            rls_data[i] = rls_data[i - 1]
            if lmdk_lib.is_landmark(p, lmdks):
                # Allocate the current budget to the following releases
                # uniformly (spread over the remaining landmarks + 1)
                for j in range(i + 1, len(seq)):
                    bgts[j] += bgts[i]/(len(lmdks) - lmdk_cur + 1)
                # No budget was spent
                bgts[i] = 0
    return rls_data, bgts, skipped
|
|
|
|
|
|
def incremental(seq, lmdks, epsilon, diff_rt):
    '''
    Incremental sampling.

    The sampling rate rises at each landmark and decays at each regular
    point; skipped points are approximated with the previous release and
    skipped landmark budget is redistributed to later releases.

    Parameters:
        seq - The point sequence. Points are indexed as p[0]..p[3];
              p[1], p[2] are latitude/longitude — assumes 4-column rows,
              TODO confirm against callers.
        lmdks - The landmarks.
        epsilon - The available privacy budget.
        diff_rt - Sampling rate difference.

    Returns:
        rls_data - The perturbed data.
        bgts - The privacy budget allocation.
        skipped - The number of skipped releases.
    '''
    # Uniform budget allocation as the starting point
    bgts = uniform(seq, lmdks, epsilon)
    # Released
    rls_data = [None]*len(seq)
    # The sampling rate
    # (publish with 50% chance)
    samp_rt = .5
    # Track how many landmarks have been seen so far
    lmdk_cur = 0
    # Track consecutive landmarks
    # NOTE(review): lmdk_con is maintained but never read in this function.
    lmdk_con = 0
    # Track skipped releases
    skipped = 0
    for i, p in enumerate(seq):
        # Check if current point is a landmark
        if lmdk_lib.is_landmark(p, lmdks):
            lmdk_cur += 1
            # Increase sampling rate (move toward always publishing)
            samp_rt += (1 - samp_rt)*diff_rt
        else:
            # Decrease sampling rate
            samp_rt -= samp_rt*diff_rt
        # Get coordinates
        loc = (p[1], p[2])
        # The first point is always published
        if i == 0 or lmdk_lib.should_sample(samp_rt):
            # Add noise to original data
            new_loc = lmdk_lib.add_polar_noise(loc, bgts[i])
            rls_data[i] = [p[0], new_loc[0], new_loc[1], p[3]]
            # Adjust consecutive landmarks tracking
            if lmdk_lib.is_landmark(p, lmdks):
                lmdk_con += 1
        else:
            skipped += 1
            # Skip current release and approximate with previous
            rls_data[i] = rls_data[i - 1]
            if lmdk_lib.is_landmark(p, lmdks):
                # Reset consecutive landmarks tracking
                lmdk_con = 0
                # Allocate the current budget to the following releases
                # uniformly (spread over the remaining landmarks + 1)
                for j in range(i + 1, len(seq)):
                    bgts[j] += bgts[i]/(len(lmdks) - lmdk_cur + 1)
                # No budget was spent
                bgts[i] = 0
    return rls_data, bgts, skipped
|
|
|
|
|
|
def uniform_r(seq, lmdks, epsilon):
    '''Release every point, perturbed with its uniformly allocated budget.'''
    # Budgets via uniform allocation
    bgts = uniform(seq, lmdks, epsilon)
    # Perturb each point in turn and collect the releases
    rls_data = []
    for idx, point in enumerate(seq):
        noisy = lmdk_lib.add_polar_noise((point[1], point[2]), bgts[idx])
        rls_data.append([point[0], noisy[0], noisy[1], point[3]])
    return rls_data, bgts
|
|
|
|
|
|
def uniform_cont(seq, lmdks, epsilon):
    '''Continual release via randomized response under uniform budgets.'''
    # Budgets via uniform allocation
    bgts = uniform(seq, lmdks, epsilon)
    rls_data = [None]*len(seq)
    for idx, point in enumerate(seq):
        # True when the point's third field is a landmark
        truth = point[2] in lmdks
        # [original, perturbed]
        rls_data[idx] = [truth, lmdk_lib.randomized_response(truth, bgts[idx])]
    return rls_data, bgts
|
|
|
|
|
|
def utility_analysis(seq, lmdks, o, epsilon):
    '''
    Analyze the utility.

    Computes the Mean Absolute Error (in meters) of the perturbed data
    against the originals, compares it with a fresh uniform-budget
    perturbation, and prints the summary.

    Parameters:
        seq - The point sequence. Points are indexed so that p[1], p[2]
              are latitude/longitude — TODO confirm against callers.
        lmdks - The landmarks.
        o - The perturbed data.
        epsilon - The available privacy budget.

    Returns:
        Nothing.
    '''
    # Estimate Mean Absolute Error of the supplied perturbed data
    mae = 0
    # Compare with a uniform-budget baseline (re-perturbed here, so the
    # baseline figure varies run to run)
    mae_u = 0
    bgts_u = uniform(seq, lmdks, epsilon)
    for i, p in enumerate(seq):
        # Geodesic distance in meters, averaged over the sequence
        mae += distance((p[1], p[2]), (o[i][1], o[i][2])).km*1000/len(seq)
        new_loc = lmdk_lib.add_polar_noise((p[1], p[2]), bgts_u[i])
        mae_u += distance((p[1], p[2]), (new_loc[0], new_loc[1])).km*1000/len(seq)

    print(
        '\n########## Analysis ##########\n'
        'MAE uniform : %.2fm\n'
        'MAE current : %.2fm\n'
        'Difference : %.2f%%\n'
        '##############################\n'
        %(mae_u, mae, 100*(mae - mae_u)/mae_u)
    )
|
|
|
|
|
|
def mae(seq, o):
    '''Mean absolute error, in meters, between original and released points.'''
    total = 0
    for idx, point in enumerate(seq):
        # Geodesic distance in meters, averaged over the sequence
        total += distance((point[1], point[2]), (o[idx][1], o[idx][2])).km*1000/len(seq)
    return total
|
|
|
|
|
|
def mae_cont(o):
    '''
    Mean absolute error for the continual (randomized-response) releases.

    Parameters:
        o - The released data: a sequence of [original, perturbed] pairs.

    Returns:
        The fraction of releases whose perturbed value differs from the
        original (0 for an empty input).
    '''
    # Guard against an empty release list (the original loop would simply
    # not run; make that explicit).
    if not len(o):
        return 0
    # Count mismatches between original and perturbed values.
    # (A leftover debug print of the result was removed.)
    return sum(1 for pair in o if pair[0] != pair[1])/len(o)
|