Merge branch 'master' of https://git.delkappa.com/manos/the-last-thing

2021-10-12 12:59:12 +02:00
parent b013496f02 5b81702d37
commit a104beb365
7 changed files with 125 additions and 43 deletions
--- a/code/lib/exp_mech.py
+++ b/code/lib/exp_mech.py
@ -11,6 +11,7 @@ from matplotlib import pyplot as plt
 import time
 def score(data, option):
  '''
    The scoring function.
@ -20,10 +21,11 @@ import time
    Returns:
      The score for the option.
  '''
 def score(data, option):
  return (option.sum() - data.sum())
  # return lmdk_lib.get_norm(data, option)
 def exponential(x, R, u, delta, epsilon):
  '''
    The exponential mechanism.
@ -37,7 +39,6 @@ def score(data, option):
      res - A randomly sampled output.
      pr - The PDF of all possible outputs.
  '''
 def exponential(x, R, u, delta, epsilon):
  # Calculate the score for each element of R
  scores = [u(x, r) for r in R]
  # Normalize the scores between 0 and 1
--- a/rslt/bgt_cmp/Copenhagen-sel.pdf
+++ b/rslt/bgt_cmp/Copenhagen-sel.pdf
--- a/rslt/bgt_cmp/HUE-sel.pdf
+++ b/rslt/bgt_cmp/HUE-sel.pdf
--- a/rslt/bgt_cmp/T-drive-sel.pdf
+++ b/rslt/bgt_cmp/T-drive-sel.pdf
--- a/text/bibliography.bib
+++ b/text/bibliography.bib
@ -1761,6 +1761,15 @@
  year      = {2017}
 }
@inproceedings{meshgi2015expanding,
  title={Expanding histogram of colors with gridding to improve tracking accuracy},
  author={Meshgi, Kourosh and Ishii, Shin},
  booktitle={2015 14th IAPR International Conference on Machine Vision Applications (MVA)},
  pages={475--479},
  year={2015},
  organization={IEEE}
 }
@inproceedings{wang2017privacy,
  title        = {Privacy Preserving Anonymity for Periodical SRS Data Publishing},
  author       = {Wang, Jie-Teng and Lin, Wen-Yang},
--- a/text/problem/theotherthing/main.tex
+++ b/text/problem/theotherthing/main.tex
@ -39,10 +39,16 @@ In Example~\ref{ex:lmdk-risk}, we demonstrate the extreme case of the applicatio
 \SetKwData{evalCur}{evalCur}
 \SetKwData{evalOrig}{evalOrig}
 \SetKwData{evalSum}{evalSum}
 \SetKwData{h}{h}
 \SetKwData{hi}{h$_i$}
 \SetKwData{hist}{hist}
 \SetKwData{histCur}{histCur}
 \SetKwData{histTmp}{histTmp}
 \SetKwData{metricCur}{metricCur}
 \SetKwData{metricOrig}{metricOrig}
 \SetKwData{opt}{opt}
 \SetKwData{opti}{opt$_i$}
 \SetKwData{opts}{opts}
 \SetKwData{optim}{optim}
 \SetKwData{optimi}{optim$_i$}
 \SetKwData{opts}{opts}
@ -51,7 +57,10 @@ In Example~\ref{ex:lmdk-risk}, we demonstrate the extreme case of the applicatio
 \SetKwFunction{calcMetric}{calcMetric}
 \SetKwFunction{evalSeq}{evalSeq}
 \SetKwFunction{getCombs}{getCombs}
 \SetKwFunction{getDiff}{getDiff}
 \SetKwFunction{getHist}{getHist}
 \SetKwFunction{getOpts}{getOpts}
 \SetKwFunction{getNorm}{getNorm}
 \input{problem/theotherthing/contribution}
 \input{problem/theotherthing/problem}
--- a/text/problem/theotherthing/solution.tex
+++ b/text/problem/theotherthing/solution.tex
@ -42,16 +42,13 @@ It finds the option that is the most \emph{similar} to the original (Lines~{\ref
  % Evaluate the original
  \evalOrig $\leftarrow$ \evalSeq{$T, \emptyset, L$}\;
  % Get all possible option combinations
  \opts $\leftarrow$ \getOpts{$T, L$}\;
  % Track the minimum (best) evaluation
  \diffMin $\leftarrow$ $\infty$\;
  % Track the optimal sequence (the one with the best evaluation)
-  \optim $\leftarrow$ $[]$\;
+  \opts $\leftarrow$ $[]$\;
-  \ForEach{\opt $\in$ \opts}{ \label{algo:lmdk-sel-opt-for-each}
+  \ForEach{\opt $\in$ \getOpts{$T, L$}}{ \label{algo:lmdk-sel-opt-for-each}
    \evalCur $\leftarrow 0$\;
    \ForEach{\opti $\in$ \opt}{
      \evalCur $\leftarrow$ \evalCur $+$ \evalSeq{$T, \opti, L$}/\#\opt\; \label{algo:lmdk-sel-opt-comparison}
@ -60,10 +57,10 @@ It finds the option that is the most \emph{similar} to the original (Lines~{\ref
    \diffCur $\leftarrow \left|\evalCur - \evalOrig\right|$\;
    \If{\diffCur $<$ \diffMin}{
      \diffMin $\leftarrow$ \diffCur\;
-      \optim $\leftarrow$ \opt\;
+      \opts $\leftarrow$ \opt\;
    }
  } \label{algo:lmdk-sel-opt-end}
-  \Return{\optim}
+  \Return{\opts}
 \end{algorithm}
 Algorithm~\ref{algo:lmdk-sel-opt} is guaranteed to return the optimal set of dummy {\thethings} with regard to the original set $L$.
@ -73,7 +70,7 @@ Next, we present a heuristic solution with improved time and space requirements.
 \paragraph{Heuristic}
 Algorithm~\ref{algo:lmdk-sel-heur} follows an incremental methodology.
-At each step it selects a new timestamp that corresponds to a regular ({non-\thething}) event from $T \setminus L$.
+At each step it selects a new timestamp, that corresponds to a regular ({non-\thething}) event from $T \setminus L$, to create an option.
 \begin{algorithm}
  \caption{Heuristic dummy {\thething} set options selection}
@ -82,14 +79,14 @@ At each step it selects a new timestamp that corresponds to a regular ({non-\the
  \DontPrintSemicolon
  \KwData{$T, L$}
-  \KwResult{\optim}
+  \KwResult{\opts}
  \BlankLine
  % Evaluate the original
  \evalOrig $\leftarrow$ \evalSeq{$T, \emptyset, L$}\;
  % Get all possible option combinations
-  \optim $\leftarrow$ $[]$\;
+  \opts $\leftarrow$ $[]$\;
  $L' \leftarrow L$\;
@ -110,45 +107,111 @@ At each step it selects a new timestamp that corresponds to a regular ({non-\the
      \If{\diffCur $<$ \diffMin}{
        \diffMin $\leftarrow$ \diffCur\;
        \optimi $\leftarrow$ \reg\;
-      }\label{algo:lmdk-sel-heur-comparison-end}
+      }\label{algo:lmdk-sel-heur-cmp-end}
    }
    % Save new point to landmarks
    $L'$.add(\optimi)\;
    % Add new option
-    \optim.append($L' \setminus L$)\;
+    \opts.append($L' \setminus L$)\;
  }\label{algo:lmdk-sel-heur-end}
-  \Return{\optim}
+  \Return{\opts}
 \end{algorithm}
-Similar to Algorithm~\ref{algo:lmdk-sel-opt}, the selection is done based on a predefined metric (Lines~{\ref{algo:lmdk-sel-heur-comparison}-\ref{algo:lmdk-sel-heur-comparison-end}}).
+Similar to Algorithm~\ref{algo:lmdk-sel-opt}, it selects new options based on a predefined metric (Lines~{\ref{algo:lmdk-sel-heur-comparison}-\ref{algo:lmdk-sel-heur-cmp-end}}).
 This process (Lines~{\ref{algo:lmdk-sel-heur-while}-\ref{algo:lmdk-sel-heur-end}}) goes on until we select a set that is equal to the size of the series of events, i.e.,~$L' = T$.
-In terms of complexity: given $n$ regular events it requires $\mathcal{O}(n^2)$ time and space.
+In terms of complexity, given $n$ regular events it requires $\mathcal{O}(n^2)$ time and space.
 Note that the reverse heuristic approach, i.e.,~starting with $T$ {\thethings} and removing until $L$, performs similarly to Algorithm~\ref{algo:lmdk-sel-heur}.
 \paragraph{Partitioned}
 We improve the complexity of Algorithm~\ref{algo:lmdk-sel-opt} by partitioning the {\thething} timestamp sequence $L$.
 In Algorithm~\ref{algo:lmdk-sel-hist}, \getHist generates a histogram from $L$ with bins of size \h.
 We find \h by using the Freedman--Diaconis rule, which is resilient to outliers and takes into account the data variability and data size~\cite{meshgi2015expanding}.
 For every possible histogram version, the \getDiff function finds the difference between two histograms; for this operation we utilize the Euclidean distance~(see Section~\ref{subsec:sel-utl} for more details).
-\mk{WIP: Histograms}
+\begin{algorithm}
  \caption{Partitioned dummy {\thething} set options selection}
  \label{algo:lmdk-sel-hist}
  \DontPrintSemicolon
  \KwData{$T, L$}
  \KwResult{\opts}
  \BlankLine
  \hist, \h $\leftarrow$ \getHist{$T, L$}\;
  \histCur $\leftarrow$ \hist\;
  \opts $\leftarrow$ $[]$\;
  \While{sum($L'$) $\neq$ len($T$)}{ \label{algo:lmdk-sel-hist-while}
    % Track the minimum (best) evaluation
    \diffMin $\leftarrow$ $\infty$\;
    % The candidate option
    \opt $\leftarrow$ \histCur\;
    % Check every possibility
    \ForEach{\hi $\in$ $L'$}{ \label{algo:lmdk-sel-hist-cmp-start}
      % Can we add one more point?
      \If{\hi $+$ $1$ $\leq$ \h}{
        \histTmp $\leftarrow$ \histCur\;
        \histTmp$[i]$ $\leftarrow$ \histTmp$[i]$ $+$ $1$\;
        % Find difference from original
        \diffCur $\leftarrow$ \getDiff{\hist, \histTmp}\;
        % Remember if it is the best that you've seen
        \If{\diffCur $<$ \diffMin}{ \label{algo:lmdk-sel-hist-cmp}
          \diffMin $\leftarrow$ \diffCur\;
          \opt $\leftarrow$ \histTmp\;
        }
      }
    } \label{algo:lmdk-sel-hist-cmp-end}
    % Update current histogram
    \histCur $\leftarrow$ \opt\;
    % Add current best to options
    \opts.append(\opt)\;
  } \label{algo:lmdk-sel-hist-end}
  \Return{\opts}
 \end{algorithm}
 Between Lines~{\ref{algo:lmdk-sel-hist-cmp-start}-\ref{algo:lmdk-sel-hist-cmp-end}} we check every possible histogram version by incrementing each bin by $1$ and comparing it to the original (Line~\ref{algo:lmdk-sel-hist-cmp}).
 At the end of the process, we return \opts which contains all the versions of \hist that are closest to \hist for all possible sizes of \hist.
 \subsubsection{Privacy-preserving option selection}
 \label{subsec:lmdk-opt-sel}
-\mk{WIP}
+The Algorithms of Section~\ref{subsec:lmdk-set-opts} return a set of possible versions of the original {\thething} set $L$ by adding extra timestamps in it from the series of events at timestamps $T \supseteq L$.
 In the next step of the process, we randomly select a set by utilizing the exponential mechanism (Section~\ref{subsec:prv-mech}).
 Prior to selecting a set, the exponential mechanism evaluates each set using a score function.
 One way to evaluate each set is by taking into account the temporal position of the events in the sequence.
 % Nearby events
 Events that occur at recent timestamps are more likely to reveal sensitive information regarding the users involved~\cite{kellaris2014differentially}.
 Thus, taking into account more recent events with respect to {\thethings} can result in less privacy loss and better privacy protection overall.
 This leads to worse data utility.
 % Depending on the {\thething} discovery technique
 The values of events near a {\thething} are usually similar to that of the latter.
 Therefore, privacy-preserving mechanisms are likely to approximate their values based on the nearest {\thething} instead of investing extra privacy budget to perturb their actual values; thus, spending less privacy budget.
 Saving privacy budget for releasing perturbed versions of actual event values can bring about better data utility. 
 % Distant events
-However, indicating the existence of randomized/dummy {\thethings} nearby actual {\thethings} can increase the adversarial confidence regarding the location of the latter within a series of events.
+However, indicating the existence of dummy {\thethings} nearby actual {\thethings} can increase the adversarial confidence regarding the location of the latter within a series of events.
-Hence, choosing randomized/dummy {\thethings} far from the actual {\thethings} (and thus less relevant) can limit the final privacy loss.
+Hence, choosing dummy {\thethings} far from the actual {\thethings} (and thus less relevant) can limit the final privacy loss.
 Another approach for the score function is to consider the number of events in each set.
 On the one hand, sets with more dummy {\thethings} may render actual {\thethings} more indistinguishable probabilistically.
 That is due to the fact that it is harder for an adversary to pick a {\thething} when the ratio of {\thethings} to the size of the set gets lower.
 On the other hand, more dummy {\thethings} lead to distributing the privacy budget to more events, and therefore investing less at each timestamp.
 This, in turn, provides a better level of privacy protection.