Merge branch 'master' of https://git.delkappa.com/manos/the-last-thing

2021-10-12 12:59:12 +02:00
parent b013496f02 5b81702d37
commit a104beb365
7 changed files with 125 additions and 43 deletions
--- a/code/lib/exp_mech.py
+++ b/code/lib/exp_mech.py
@ -11,33 +11,34 @@ from matplotlib import pyplot as plt
 import time


-'''
-  The scoring function.
-
-  Parameters:
-    data - The data.
-    option - The option to evaluate.
-  Returns:
-    The score for the option.
-'''
 def score(data, option):
+  '''
+    The scoring function.
+
+    Parameters:
+      data - The data.
+      option - The option to evaluate.
+    Returns:
+      The score for the option.
+  '''
  return (option.sum() - data.sum())
+  # return lmdk_lib.get_norm(data, option)


-'''
-  The exponential mechanism.
-
-  Parameters:
-    x - The data.
-    R - The possible outputs.
-    u - The scoring function.
-    delta - The sensitivity of the scoring function.
-    epsilon - The privacy budget.
-  Returns:
-    res - A randomly sampled output.
-    pr - The PDF of all possible outputs.
-'''
 def exponential(x, R, u, delta, epsilon):
+  '''
+    The exponential mechanism.
+
+    Parameters:
+      x - The data.
+      R - The possible outputs.
+      u - The scoring function.
+      delta - The sensitivity of the scoring function.
+      epsilon - The privacy budget.
+    Returns:
+      res - A randomly sampled output.
+      pr - The PDF of all possible outputs.
+  '''
  # Calculate the score for each element of R
  scores = [u(x, r) for r in R]
  # Normalize the scores between 0 and 1
--- a/rslt/bgt_cmp/Copenhagen-sel.pdf
+++ b/rslt/bgt_cmp/Copenhagen-sel.pdf
--- a/rslt/bgt_cmp/HUE-sel.pdf
+++ b/rslt/bgt_cmp/HUE-sel.pdf
--- a/rslt/bgt_cmp/T-drive-sel.pdf
+++ b/rslt/bgt_cmp/T-drive-sel.pdf
--- a/text/bibliography.bib
+++ b/text/bibliography.bib
@ -1761,6 +1761,15 @@
  year      = {2017}
 }

+@inproceedings{meshgi2015expanding,
+  title={Expanding histogram of colors with gridding to improve tracking accuracy},
+  author={Meshgi, Kourosh and Ishii, Shin},
+  booktitle={2015 14th IAPR International Conference on Machine Vision Applications (MVA)},
+  pages={475--479},
+  year={2015},
+  organization={IEEE}
+}
+
@inproceedings{wang2017privacy,
  title        = {Privacy Preserving Anonymity for Periodical SRS Data Publishing},
  author       = {Wang, Jie-Teng and Lin, Wen-Yang},
--- a/text/problem/theotherthing/main.tex
+++ b/text/problem/theotherthing/main.tex
@ -39,10 +39,16 @@ In Example~\ref{ex:lmdk-risk}, we demonstrate the extreme case of the applicatio
 \SetKwData{evalCur}{evalCur}
 \SetKwData{evalOrig}{evalOrig}
 \SetKwData{evalSum}{evalSum}
+\SetKwData{h}{h}
+\SetKwData{hi}{h$_i$}
+\SetKwData{hist}{hist}
+\SetKwData{histCur}{histCur}
+\SetKwData{histTmp}{histTmp}
 \SetKwData{metricCur}{metricCur}
 \SetKwData{metricOrig}{metricOrig}
 \SetKwData{opt}{opt}
 \SetKwData{opti}{opt$_i$}
+\SetKwData{opts}{opts}
 \SetKwData{optim}{optim}
 \SetKwData{optimi}{optim$_i$}
 \SetKwData{opts}{opts}
@ -51,7 +57,10 @@ In Example~\ref{ex:lmdk-risk}, we demonstrate the extreme case of the applicatio
 \SetKwFunction{calcMetric}{calcMetric}
 \SetKwFunction{evalSeq}{evalSeq}
 \SetKwFunction{getCombs}{getCombs}
+\SetKwFunction{getDiff}{getDiff}
+\SetKwFunction{getHist}{getHist}
 \SetKwFunction{getOpts}{getOpts}
+\SetKwFunction{getNorm}{getNorm}

 \input{problem/theotherthing/contribution}
 \input{problem/theotherthing/problem}
--- a/text/problem/theotherthing/solution.tex
+++ b/text/problem/theotherthing/solution.tex
@ -42,16 +42,13 @@ It finds the option that is the most \emph{similar} to the original (Lines~{\ref
  % Evaluate the original
  \evalOrig $\leftarrow$ \evalSeq{$T, \emptyset, L$}\;

-  % Get all possible option combinations
-  \opts $\leftarrow$ \getOpts{$T, L$}\;
-
  % Track the minimum (best) evaluation
  \diffMin $\leftarrow$ $\infty$\;

  % Track the optimal sequence (the one with the best evaluation)
-  \optim $\leftarrow$ $[]$\;
+  \opts $\leftarrow$ $[]$\;

-  \ForEach{\opt $\in$ \opts}{ \label{algo:lmdk-sel-opt-for-each}
+  \ForEach{\opt $\in$ \getOpts{$T, L$}}{ \label{algo:lmdk-sel-opt-for-each}
    \evalCur $\leftarrow 0$\;
    \ForEach{\opti $\in$ \opt}{
      \evalCur $\leftarrow$ \evalCur $+$ \evalSeq{$T, \opti, L$}/\#\opt\; \label{algo:lmdk-sel-opt-comparison}
@ -60,10 +57,10 @@ It finds the option that is the most \emph{similar} to the original (Lines~{\ref
    \diffCur $\leftarrow \left|\evalCur - \evalOrig\right|$\;
    \If{\diffCur $<$ \diffMin}{
      \diffMin $\leftarrow$ \diffCur\;
-      \optim $\leftarrow$ \opt\;
+      \opts $\leftarrow$ \opt\;
    }
  } \label{algo:lmdk-sel-opt-end}
-  \Return{\optim}
+  \Return{\opts}
 \end{algorithm}

 Algorithm~\ref{algo:lmdk-sel-opt} guarantees to return the optimal set of dummy {\thethings} with regard to the original set $L$.
@ -73,7 +70,7 @@ Next, we present a heuristic solution with improved time and space requirements.

 \paragraph{Heuristic}
 Algorithm~\ref{algo:lmdk-sel-heur}, follows an incremental methodology.
-At each step it selects a new timestamp that corresponds to a regular ({non-\thething}) event from $T \setminus L$.
+At each step, it selects a new timestamp that corresponds to a regular ({non-\thething}) event from $T \setminus L$ in order to create an option.

 \begin{algorithm}
  \caption{Heuristic dummy {\thething} set options selection}
@ -82,14 +79,14 @@ At each step it selects a new timestamp that corresponds to a regular ({non-\the
  \DontPrintSemicolon

  \KwData{$T, L$}
-  \KwResult{\optim}
+  \KwResult{\opts}
  \BlankLine

  % Evaluate the original
  \evalOrig $\leftarrow$ \evalSeq{$T, \emptyset, L$}\;

  % Get all possible option combinations
-  \optim $\leftarrow$ $[]$\;
+  \opts $\leftarrow$ $[]$\;

  $L' \leftarrow L$\;

@ -110,45 +107,111 @@ At each step it selects a new timestamp that corresponds to a regular ({non-\the
      \If{\diffCur $<$ \diffMin}{
        \diffMin $\leftarrow$ \diffCur\;
        \optimi $\leftarrow$ \reg\;
-      }\label{algo:lmdk-sel-heur-comparison-end}
+      }\label{algo:lmdk-sel-heur-cmp-end}
    }

    % Save new point to landmarks
    $L'$.add(\optimi)\;

    % Add new option
-    \optim.append($L' \setminus L$)\;
+    \opts.append($L' \setminus L$)\;
  }\label{algo:lmdk-sel-heur-end}

-  \Return{\optim}
+  \Return{\opts}
 \end{algorithm}

-Similar to Algorithm~\ref{algo:lmdk-sel-opt}, the selection is done based on a predefined metric (Lines~{\ref{algo:lmdk-sel-heur-comparison}-\ref{algo:lmdk-sel-heur-comparison-end}}).
+Similar to Algorithm~\ref{algo:lmdk-sel-opt}, it selects new options based on a predefined metric (Lines~{\ref{algo:lmdk-sel-heur-comparison}-\ref{algo:lmdk-sel-heur-cmp-end}}).
 This process (Lines~{\ref{algo:lmdk-sel-heur-while}-\ref{algo:lmdk-sel-heur-end}}) goes on until we select a set that is equal to the size of the series of events, i.e.,~$L' = T$.

-In terms of complexity: given $n$ regular events it requires $\mathcal{O}(n^2)$ time and space.
+In terms of complexity, given $n$ regular events it requires $\mathcal{O}(n^2)$ time and space.
 Note that the reverse heuristic approach, i.e.,~starting with $T$ {\thethings} and removing until $L$, performs similarly to Algorithm~\ref{algo:lmdk-sel-heur}.


+\paragraph{Partitioned}
+We improve the complexity of Algorithm~\ref{algo:lmdk-sel-opt} by partitioning the {\thething} timestamp sequence $L$.
+In Algorithm~\ref{algo:lmdk-sel-hist}, \getHist generates a histogram from $L$ with bins of size \h.
+We find \h by using the Freedman--Diaconis rule, which is resilient to outliers and takes into account the data variability and data size~\cite{meshgi2015expanding}.
+For every possible histogram version, the \getDiff function finds the difference between two histograms; for this operation we utilize the Euclidean distance~(see Section~\ref{subsec:sel-utl} for more details).

-\mk{WIP: Histograms}
+\begin{algorithm}
+  \caption{Partitioned dummy {\thething} set options selection}
+  \label{algo:lmdk-sel-hist}
+
+  \DontPrintSemicolon
+
+  \KwData{$T, L$}
+  \KwResult{\opts}
+  \BlankLine
+
+  \hist, \h $\leftarrow$ \getHist{$T, L$}\;
+
+  \histCur $\leftarrow$ \hist\;
+
+  \opts $\leftarrow$ $[]$\;
+
+  \While{sum(\histCur) $\neq$ len($T$)}{ \label{algo:lmdk-sel-hist-while}
+    % Track the minimum (best) evaluation
+    \diffMin $\leftarrow$ $\infty$\;
+
+    % The candidate option
+    \opt $\leftarrow$ \histCur\;
+
+    % Check every possibility
+    \ForEach{\hi $\in$ \histCur}{ \label{algo:lmdk-sel-hist-cmp-start}
+
+      % Can we add one more point?
+      \If{\hi $+$ $1$ $\leq$ \h}{
+        \histTmp $\leftarrow$ \histCur\;
+        \histTmp$[i]$ $\leftarrow$ \histTmp$[i]$ $+$ $1$\;
+        % Find difference from original
+        \diffCur $\leftarrow$ \getDiff{\hist, \histTmp}\;
+
+        % Remember if it is the best that you've seen
+        \If{\diffCur $<$ \diffMin}{ \label{algo:lmdk-sel-hist-cmp}
+          \diffMin $\leftarrow$ \diffCur\;
+          \opt $\leftarrow$ \histTmp\;
+        }
+
+      }
+
+    } \label{algo:lmdk-sel-hist-cmp-end}
+
+    % Update current histogram
+    \histCur $\leftarrow$ \opt\;
+    % Add current best to options
+    \opts.append(\opt)\;
+
+  } \label{algo:lmdk-sel-hist-end}
+
+  \Return{\opts}
+\end{algorithm}
+
+Between Lines~{\ref{algo:lmdk-sel-hist-cmp-start}-\ref{algo:lmdk-sel-hist-cmp-end}} we check every possible histogram version by incrementing each bin by $1$ and comparing it to the original (Line~\ref{algo:lmdk-sel-hist-cmp}).
+At the end of the process, we return \opts, which contains all the versions of \hist that are closest to \hist for all possible sizes of \hist.


 \subsubsection{Privacy-preserving option selection}
 \label{subsec:lmdk-opt-sel}

-\mk{WIP}
+The algorithms of Section~\ref{subsec:lmdk-set-opts} return a set of possible versions of the original {\thething} set $L$ by adding extra timestamps to it from the series of events at timestamps $T \supseteq L$.
+In the next step of the process, we randomly select a set by utilizing the exponential mechanism (Section~\ref{subsec:prv-mech}).
+Prior to selecting a set, the exponential mechanism evaluates each set using a score function.

+One way to evaluate each set is by taking into account the temporal position of the events in the sequence.
 % Nearby events
 Events that occur at recent timestamps are more likely to reveal sensitive information regarding the users involved~\cite{kellaris2014differentially}.
 Thus, taking into account more recent events with respect to {\thethings} can result in less privacy loss and better privacy protection overall.
 This leads to worse data utility.
-
 % Depending on the {\thething} discovery technique
 The values of events near a {\thething} are usually similar to that of the latter.
 Therefore, privacy-preserving mechanisms are likely to approximate their values based on the nearest {\thething} instead of investing extra privacy budget to perturb their actual values; thus, spending less privacy budget.
 Saving privacy budget for releasing perturbed versions of actual event values can bring about better data utility. 
-
 % Distant events
-However, indicating the existence of randomized/dummy {\thethings} nearby actual {\thethings} can increase the adversarial confidence regarding the location of the latter within a series of events.
-Hence, choosing randomized/dummy {\thethings} far from the actual {\thethings} (and thus less relevant) can limit the final privacy loss.
+However, indicating the existence of dummy {\thethings} nearby actual {\thethings} can increase the adversarial confidence regarding the location of the latter within a series of events.
+Hence, choosing dummy {\thethings} far from the actual {\thethings} (and thus less relevant) can limit the final privacy loss.
+
+Another approach for the score function is to consider the number of events in each set.
+On the one hand, sets with more dummy {\thethings} may render actual {\thethings} more indistinguishable probabilistically.
+That is because it is harder for an adversary to pick a {\thething} when the ratio of {\thethings} to the size of the set gets lower.
+On the other hand, more dummy {\thethings} lead to distributing the privacy budget to more events, and therefore investing less at each timestamp.
+Thus, they provide a better level of privacy protection.