diff --git a/code/lib/exp_mech.py b/code/lib/exp_mech.py
index cd298fb..60bab2b 100644
--- a/code/lib/exp_mech.py
+++ b/code/lib/exp_mech.py
@@ -11,33 +11,34 @@ from matplotlib import pyplot as plt
 import time
 
-'''
-    The scoring function.
-
-    Parameters:
-        data - The data.
-        option - The option to evaluate.
-    Returns:
-        The score for the option.
-'''
 def score(data, option):
+    '''
+    The scoring function.
+
+    Parameters:
+        data - The data.
+        option - The option to evaluate.
+    Returns:
+        The score for the option.
+    '''
     return (option.sum() - data.sum())
+    # return lmdk_lib.get_norm(data, option)
 
-'''
-    The exponential mechanism.
-
-    Parameters:
-        x - The data.
-        R - The possible outputs.
-        u - The scoring function.
-        delta - The sensitivity of the scoring function.
-        epsilon - The privacy budget.
-    Returns:
-        res - A randomly sampled output.
-        pr - The PDF of all possible outputs.
-'''
 def exponential(x, R, u, delta, epsilon):
+    '''
+    The exponential mechanism.
+
+    Parameters:
+        x - The data.
+        R - The possible outputs.
+        u - The scoring function.
+        delta - The sensitivity of the scoring function.
+        epsilon - The privacy budget.
+    Returns:
+        res - A randomly sampled output.
+        pr - The PDF of all possible outputs.
+    '''
     # Calculate the score for each element of R
     scores = [u(x, r) for r in R]
     # Normalize the scores between 0 and 1
diff --git a/rslt/bgt_cmp/Copenhagen-sel.pdf b/rslt/bgt_cmp/Copenhagen-sel.pdf
index b81c1c1..9736b4d 100644
Binary files a/rslt/bgt_cmp/Copenhagen-sel.pdf and b/rslt/bgt_cmp/Copenhagen-sel.pdf differ
diff --git a/rslt/bgt_cmp/HUE-sel.pdf b/rslt/bgt_cmp/HUE-sel.pdf
index 5f75788..05a381e 100644
Binary files a/rslt/bgt_cmp/HUE-sel.pdf and b/rslt/bgt_cmp/HUE-sel.pdf differ
diff --git a/rslt/bgt_cmp/T-drive-sel.pdf b/rslt/bgt_cmp/T-drive-sel.pdf
new file mode 100644
index 0000000..1832b87
Binary files /dev/null and b/rslt/bgt_cmp/T-drive-sel.pdf differ
diff --git a/text/bibliography.bib b/text/bibliography.bib
index 896bf6d..0eb4fca 100644
--- a/text/bibliography.bib
+++ b/text/bibliography.bib
@@ -1761,6 +1761,15 @@ year = {2017}
 }
 
+@inproceedings{meshgi2015expanding,
+  title = {Expanding histogram of colors with gridding to improve tracking accuracy},
+  author = {Meshgi, Kourosh and Ishii, Shin},
+  booktitle = {2015 14th IAPR International Conference on Machine Vision Applications (MVA)},
+  pages = {475--479},
+  year = {2015},
+  organization = {IEEE}
+}
+
 @inproceedings{wang2017privacy,
   title = {Privacy Preserving Anonymity for Periodical SRS Data Publishing},
   author = {Wang, Jie-Teng and Lin, Wen-Yang},
diff --git a/text/problem/theotherthing/main.tex b/text/problem/theotherthing/main.tex
index 3bd7eb0..4dcab94 100644
--- a/text/problem/theotherthing/main.tex
+++ b/text/problem/theotherthing/main.tex
@@ -39,10 +39,16 @@ In Example~\ref{ex:lmdk-risk}, we demonstrate the extreme case of the applicatio
 \SetKwData{evalCur}{evalCur}
 \SetKwData{evalOrig}{evalOrig}
 \SetKwData{evalSum}{evalSum}
+\SetKwData{h}{h}
+\SetKwData{hi}{h$_i$}
+\SetKwData{hist}{hist}
+\SetKwData{histCur}{histCur}
+\SetKwData{histTmp}{histTmp}
 \SetKwData{metricCur}{metricCur}
 \SetKwData{metricOrig}{metricOrig}
 \SetKwData{opt}{opt}
 \SetKwData{opti}{opt$_i$}
+\SetKwData{opts}{opts}
 \SetKwData{optim}{optim}
 \SetKwData{optimi}{optim$_i$}
 \SetKwData{opts}{opts}
 \SetKwData{reg}{reg}
 \SetKwData{diffCur}{diffCur}
 \SetKwData{diffMin}{diffMin}
 \SetKwFunction{calcMetric}{calcMetric}
 \SetKwFunction{evalSeq}{evalSeq}
 \SetKwFunction{getCombs}{getCombs}
+\SetKwFunction{getDiff}{getDiff}
+\SetKwFunction{getHist}{getHist}
 \SetKwFunction{getOpts}{getOpts}
+\SetKwFunction{getNorm}{getNorm}
 
 \input{problem/theotherthing/contribution}
 \input{problem/theotherthing/problem}
diff --git a/text/problem/theotherthing/solution.tex b/text/problem/theotherthing/solution.tex
index bc970db..fe37676 100644
--- a/text/problem/theotherthing/solution.tex
+++ b/text/problem/theotherthing/solution.tex
@@ -42,16 +42,13 @@ It finds the option that is the most \emph{similar} to the original (Lines~{\ref
   % Evaluate the original
   \evalOrig $\leftarrow$ \evalSeq{$T, \emptyset, L$}\;
 
-  % Get all possible option combinations
-  \opts $\leftarrow$ \getOpts{$T, L$}\;
-
   % Track the minimum (best) evaluation
   \diffMin $\leftarrow$ $\infty$\;
 
   % Track the optimal sequence (the one with the best evaluation)
-  \optim $\leftarrow$ $[]$\;
+  \opts $\leftarrow$ $[]$\;
 
-  \ForEach{\opt $\in$ \opts}{ \label{algo:lmdk-sel-opt-for-each}
+  \ForEach{\opt $\in$ \getOpts{$T, L$}}{ \label{algo:lmdk-sel-opt-for-each}
     \evalCur $\leftarrow 0$\;
     \ForEach{\opti $\in$ \opt}{
       \evalCur $\leftarrow$ \evalCur $+$ \evalSeq{$T, \opti, L$}/\#\opt\; \label{algo:lmdk-sel-opt-comparison}
@@ -60,10 +57,10 @@ It finds the option that is the most \emph{similar} to the original (Lines~{\ref
     \diffCur $\leftarrow \left|\evalCur - \evalOrig\right|$\;
     \If{\diffCur $<$ \diffMin}{
       \diffMin $\leftarrow$ \diffCur\;
-      \optim $\leftarrow$ \opt\;
+      \opts $\leftarrow$ \opt\;
     }
   } \label{algo:lmdk-sel-opt-end}
-  \Return{\optim}
+  \Return{\opts}
 \end{algorithm}
 
 Algorithm~\ref{algo:lmdk-sel-opt} guarantees to return the optimal set of dummy {\thethings} with regard to the original set $L$.
@@ -73,7 +70,7 @@ Next, we present a heuristic solution with improved time and space requirements.
 
 \paragraph{Heuristic}
 Algorithm~\ref{algo:lmdk-sel-heur}, follows an incremental methodology.
-At each step it selects a new timestamp that corresponds to a regular ({non-\thething}) event from $T \setminus L$.
+At each step, it selects a new timestamp that corresponds to a regular ({non-\thething}) event from $T \setminus L$ to create an option.
 
 \begin{algorithm}
   \caption{Heuristic dummy {\thething} set options selection}
@@ -82,14 +79,14 @@ At each step it selects a new timestamp that corresponds to a regular ({non-\the
   \DontPrintSemicolon
 
   \KwData{$T, L$}
-  \KwResult{\optim}
+  \KwResult{\opts}
   \BlankLine
 
   % Evaluate the original
   \evalOrig $\leftarrow$ \evalSeq{$T, \emptyset, L$}\;
 
   % Get all possible option combinations
-  \optim $\leftarrow$ $[]$\;
+  \opts $\leftarrow$ $[]$\;
 
   $L' \leftarrow L$\;
 
@@ -110,45 +107,111 @@ At each step it selects a new timestamp that corresponds to a regular ({non-\the
       \If{\diffCur $<$ \diffMin}{
         \diffMin $\leftarrow$ \diffCur\;
         \optimi $\leftarrow$ \reg\;
-      }\label{algo:lmdk-sel-heur-comparison-end}
+      }\label{algo:lmdk-sel-heur-cmp-end}
     }
 
     % Save new point to landmarks
    $L'$.add(\optimi)\;
 
    % Add new option
-   \optim.append($L' \setminus L$)\;
+   \opts.append($L' \setminus L$)\;
  }\label{algo:lmdk-sel-heur-end}
 
- \Return{\optim}
+ \Return{\opts}
 \end{algorithm}
 
-Similar to Algorithm~\ref{algo:lmdk-sel-opt}, the selection is done based on a predefined metric (Lines~{\ref{algo:lmdk-sel-heur-comparison}-\ref{algo:lmdk-sel-heur-comparison-end}}).
+Similar to Algorithm~\ref{algo:lmdk-sel-opt}, it selects new options based on a predefined metric (Lines~{\ref{algo:lmdk-sel-heur-comparison}-\ref{algo:lmdk-sel-heur-cmp-end}}).
 This process (Lines~{\ref{algo:lmdk-sel-heur-while}-\ref{algo:lmdk-sel-heur-end}}) goes on until we select a set that is equal to the size of the series of events, i.e.,~$L' = T$.
-In terms of complexity: given $n$ regular events it requires $\mathcal{O}(n^2)$ time and space.
+In terms of complexity, given $n$ regular events, it requires $\mathcal{O}(n^2)$ time and space.
 Note that the reverse heuristic approach, i.e.,~starting with $T$ {\thethings} and removing until $L$, performs similarly with Algorithm~\ref{algo:lmdk-sel-heur}.
 
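A minimal Python sketch of the heuristic, provided for readability only and not part of this patch: eval_seq stands in for \evalSeq and must be supplied by the caller, the per-candidate evaluation is an assumption modeled on the \evalSeq usage in Algorithm~\ref{algo:lmdk-sel-opt}, and T and L are assumed to be collections of timestamps with $L \subseteq T$.

# Illustrative sketch of the heuristic selection; not part of this patch.
def heuristic_options(T, L, eval_seq):
    eval_orig = eval_seq(T, set(), L)       # evaluation of the original sequence
    opts = []                               # one option per iteration
    L_cur = set(L)                          # L' starts from the actual landmarks
    while L_cur != set(T):                  # grow until every timestamp is included
        diff_min, best = float('inf'), None
        for reg in set(T) - L_cur:          # candidate regular (non-landmark) events
            # Assumed per-candidate evaluation, mirroring evalSeq above.
            diff_cur = abs(eval_seq(T, L_cur | {reg}, L) - eval_orig)
            if diff_cur < diff_min:         # keep the candidate closest to the original
                diff_min, best = diff_cur, reg
        L_cur.add(best)                     # commit the best candidate to L'
        opts.append(L_cur - set(L))         # dummy timestamps accumulated so far
    return opts

Each appended element corresponds to \opts.append($L' \setminus L$), i.e., the dummy timestamps accumulated up to that iteration.
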
+\paragraph{Partitioned}
+We improve the complexity of Algorithm~\ref{algo:lmdk-sel-opt} by partitioning the {\thething} timestamp sequence $L$.
+In Algorithm~\ref{algo:lmdk-sel-hist}, the \getHist function generates a histogram from $L$ with bins of size \h.
+We find \h by using the Freedman–Diaconis rule, which is resilient to outliers and takes into account the variability and the size of the data~\cite{meshgi2015expanding}.
+For every possible histogram version, the \getDiff function measures its difference from the original histogram; for this operation, we utilize the Euclidean distance~(see Section~\ref{subsec:sel-utl} for more details).
 
-\mk{WIP: Histograms}
 
+\begin{algorithm}
+  \caption{Partitioned dummy {\thething} set options selection}
+  \label{algo:lmdk-sel-hist}
+
+  \DontPrintSemicolon
+
+  \KwData{$T, L$}
+  \KwResult{\opts}
+  \BlankLine
+
+  \hist, \h $\leftarrow$ \getHist{$T, L$}\;
+
+  \histCur $\leftarrow$ \hist\;
+
+  \opts $\leftarrow$ $[]$\;
+
+  \While{sum(\histCur) $\neq$ len($T$)}{ \label{algo:lmdk-sel-hist-while}
+    % Track the minimum (best) evaluation
+    \diffMin $\leftarrow$ $\infty$\;
+
+    % The candidate option
+    \opt $\leftarrow$ \histCur\;
+
+    % Check every possibility
+    \ForEach{\hi $\in$ \histCur}{ \label{algo:lmdk-sel-hist-cmp-start}
+
+      % Can we add one more point to this bin?
+      \If{\hi $+$ $1$ $\leq$ \h}{
+        \histTmp $\leftarrow$ \histCur\;
+        \histTmp$[i]$ $\leftarrow$ \histTmp$[i]$ $+$ $1$\;
+        % Find difference from original
+        \diffCur $\leftarrow$ \getDiff{\hist, \histTmp}\;
+
+        % Remember if it is the best that you've seen
+        \If{\diffCur $<$ \diffMin}{ \label{algo:lmdk-sel-hist-cmp}
+          \diffMin $\leftarrow$ \diffCur\;
+          \opt $\leftarrow$ \histTmp\;
+        }
+
+      }
+
+    } \label{algo:lmdk-sel-hist-cmp-end}
+
+    % Update current histogram
+    \histCur $\leftarrow$ \opt\;
+    % Add current best to options
+    \opts.append(\opt)\;
+
+  } \label{algo:lmdk-sel-hist-end}
+
+  \Return{\opts}
+\end{algorithm}
+
+In Lines~{\ref{algo:lmdk-sel-hist-cmp-start}-\ref{algo:lmdk-sel-hist-cmp-end}}, we generate every possible histogram version by incrementing each bin by $1$ and comparing the result to the original (Line~\ref{algo:lmdk-sel-hist-cmp}).
+At the end of the process, we return \opts, which contains, for every possible histogram size, the version that is closest to the original \hist.
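To make the partitioned variant concrete, here is a hedged NumPy sketch, not part of this patch. It assumes unit-spaced integer timestamps, so that the Freedman–Diaconis bin width \h can also act as a per-bin capacity; get_hist and get_diff only mirror the intent of \getHist and \getDiff and are not the repository implementations.

# Illustrative sketch of the partitioned selection; not part of this patch.
import numpy as np

def get_hist(T, L):
    # Bin the landmark timestamps L over the span of T; bins='fd' applies
    # the Freedman-Diaconis rule.
    edges = np.histogram_bin_edges(np.asarray(L), bins='fd', range=(min(T), max(T)))
    hist, _ = np.histogram(np.asarray(L), bins=edges)
    h = edges[1] - edges[0]                  # bin width, used here as bin capacity
    return hist, h

def get_diff(hist_a, hist_b):
    # Euclidean distance between two histogram versions.
    return float(np.linalg.norm(np.asarray(hist_a) - np.asarray(hist_b)))

def partitioned_options(T, L):
    hist, h = get_hist(T, L)
    hist_cur = hist.copy()
    opts = []
    while hist_cur.sum() != len(T):          # stop once every timestamp is covered
        diff_min, opt = float('inf'), None
        for i in range(len(hist_cur)):       # try adding one event to each bin
            if hist_cur[i] + 1 <= h:         # respect the bin capacity
                hist_tmp = hist_cur.copy()
                hist_tmp[i] += 1
                diff_cur = get_diff(hist, hist_tmp)
                if diff_cur < diff_min:      # keep the version closest to the original
                    diff_min, opt = diff_cur, hist_tmp
        if opt is None:                      # no bin can take another event
            break
        hist_cur = opt
        opts.append(opt)
    return opts
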
 
 \subsubsection{Privacy-preserving option selection}
 \label{subsec:lmdk-opt-sel}
 
-\mk{WIP}
+The algorithms of Section~\ref{subsec:lmdk-set-opts} return a set of possible versions of the original {\thething} set $L$ by adding to it extra timestamps from the series of events at timestamps $T \supseteq L$.
+In the next step of the process, we randomly select one of these sets by utilizing the exponential mechanism (Section~\ref{subsec:prv-mech}).
+Prior to selecting a set, the exponential mechanism evaluates each candidate set using a score function.
+One way to evaluate each set is by taking into account the temporal position of the events in the sequence.
 
 % Nearby events
 Events that occur at recent timestamps are more likely to reveal sensitive information regarding the users involved~\cite{kellaris2014differentially}.
 Thus, taking into account more recent events with respect to {\thethings} can result in less privacy loss and better privacy protection overall.
 This leads to worse data utility.
-
 % Depending on the {\thething} discovery technique
 The values of events near a {\thething} are usually similar to that of the latter.
 Therefore, privacy-preserving mechanisms are likely to approximate their values based on the nearest {\thething} instead of investing extra privacy budget to perturb their actual values; thus, spending less privacy budget.
 Saving privacy budget for releasing perturbed versions of actual event values can bring about better data utility.
-
 % Distant events
-However, indicating the existence of randomized/dummy {\thethings} nearby actual {\thethings} can increase the adversarial confidence regarding the location of the latter within a series of events.
-Hence, choosing randomized/dummy {\thethings} far from the actual {\thethings} (and thus less relevant) can limit the final privacy loss.
+However, indicating the existence of dummy {\thethings} near actual {\thethings} can increase the adversarial confidence regarding the location of the latter within a series of events.
+Hence, choosing dummy {\thethings} far from the actual {\thethings} (and thus less relevant) can limit the final privacy loss.
+
+Another approach for the score function is to consider the number of events in each set.
+On the one hand, sets with more dummy {\thethings} may render the actual {\thethings} probabilistically more indistinguishable.
+This is because it is harder for an adversary to single out an actual {\thething} when the ratio of actual {\thethings} to the size of the set decreases.
+On the other hand, more dummy {\thethings} lead to distributing the privacy budget over more events, and therefore to investing less at each timestamp.
+This provides a better level of privacy protection overall.
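To tie this subsection back to code/lib/exp_mech.py from this patch, the two scoring ideas above could be prototyped as below. Everything here is an assumption for illustration: the options are taken to be collections of dummy timestamps, temporal_score and size_score are candidate score functions rather than part of the patch, and the delta and epsilon values in the usage comment are placeholders; the sensitivity of whichever score is adopted must be derived separately.

# Illustrative score functions for the selection step; not part of this patch.
def temporal_score(L, opt):
    # Favor options whose dummy landmarks lie far from the actual landmarks L.
    return min(abs(t - l) for t in opt for l in L) if opt else 0

def size_score(L, opt):
    # Favor options that lower the ratio of actual landmarks to the set size.
    return len(opt) / (len(opt) + len(L))

# Example use with the exponential mechanism of code/lib/exp_mech.py,
# whose signature is exponential(x, R, u, delta, epsilon):
# selected, pr = exponential(L, opts, temporal_score, delta=1.0, epsilon=0.5)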