diff --git a/tables/micro.tex b/tables/micro.tex index b61f711..a7d912a 100644 --- a/tables/micro.tex +++ b/tables/micro.tex @@ -81,8 +81,12 @@ \hyperlink{ye2017trajectory}{\textbf{Ye et al.}} & infinite & streaming & global & event & linkage & generalization & $l$-diversity \\ \cite{ye2017trajectory} & (sequential) & & & & & & \\ \hdashline - \hyperlink{cao2017quantifying}{\textbf{Cao et al.}} & finite/ & streaming & global & user/ & dependence & perturbation & differential \\ - \cite{cao2017quantifying,cao2018quantifying} & infinite & & & ($w$-)event & (temporal) & (Laplace) & privacy \\ + \hyperlink{cao2017quantifying}{\textbf{Cao et al.}} & infinite & streaming & global & user/ & dependence & perturbation & differential \\ + \cite{cao2017quantifying,cao2018quantifying} & & & & ($w$-)event & (temporal) & (Laplace) & privacy \\ \hdashline + + \hyperlink{naim2019off}{\emph{ON-OFF privacy}} & infinite & streaming & local & event & dependence & randomization & - \\ + \cite{naim2019off, ye2019preserving} & & & & & (serial) & & \\ + \cite{ye2020off, ye2021off} & & & & & & & \\ \bottomrule diff --git a/tables/statistical.tex b/tables/statistical.tex index 06a63db..75ddd2e 100644 --- a/tables/statistical.tex +++ b/tables/statistical.tex @@ -76,7 +76,13 @@ \cite{li2007hiding} & & & & & (serial) & & \\ \hdashline \hyperlink{chen2017pegasus}{\emph{PeGaSus}} & infinite & streaming & global & event & linkage & perturbation & differential \\ - \cite{chen2017pegasus} & & & & & & (Laplace) & privacy \\ + \cite{chen2017pegasus} & & & & & & (Laplace) & privacy \\ \hdashline + + \hyperlink{farokhi2020temporally}{Farokhi} & infinite & streaming & global & - & linkage & perturbation & differential \\ + \cite{farokhi2020temporally} & & & & & & (Laplace) & privacy \\ \hdashline + + \hyperlink{ma2019real}{\textbf{\emph{RPTR}}} & infinite & streaming & global & $w$-event & linkage & perturbation & differential \\ + \cite{ma2019real} & & & & & & (Laplace) & privacy \\ \bottomrule diff 
--git a/text/acknowledgements.tex b/text/acknowledgements.tex index b1a9fc4..d1f2dd5 100644 --- a/text/acknowledgements.tex +++ b/text/acknowledgements.tex @@ -1,10 +1,10 @@ \chapter{Acknowledgements} \label{ch:ack} - +\mk{WIP} Upon the completion of my thesis, I would like to express my deep gratitude to my research supervisors for their patient guidance, enthusiastic encouragement and useful critiques of this research work. -Besides my advisors, I would like to thank the reporters as well as the rest of the jury for their invaluable contribution. +I would also like to thank the reporters for their feedback, comments, and time. -\kat{the jury and the reporters do not contribute; thank them for their feedback, comments and time} +% \kat{the jury and the reporters do not contribute; thank them for their feedback, comments and time} A special thanks to my department’s faculty, staff and fellow researchers for their valuable assistance whenever needed and for creating a pleasant and creative environment during my studies. 
diff --git a/text/bibliography.bib b/text/bibliography.bib index a1454df..17ae1bf 100644 --- a/text/bibliography.bib +++ b/text/bibliography.bib @@ -1180,6 +1180,17 @@ publisher = {IEEE} } +@article{ma2019real, + title = {Real-time privacy-preserving data release over vehicle trajectory}, + author = {Ma, Zhuo and Zhang, Tian and Liu, Ximeng and Li, Xinghua and Ren, Kui}, + journal = {IEEE Transactions on Vehicular Technology}, + volume = {68}, + number = {8}, + pages = {8091--8102}, + year = {2019}, + publisher = {IEEE} +} + @inproceedings{machanavajjhala2006diversity, title = {l-diversity: Privacy beyond k-anonymity}, author = {Machanavajjhala, Ashwin and Gehrke, Johannes and Kifer, Daniel and Venkitasubramaniam, Muthuramakrishnan}, @@ -1267,6 +1278,15 @@ publisher = {Now Publishers, Inc.} } +@inproceedings{naim2019off, + title = {{ON-OFF} privacy with correlated requests}, + author = {Naim, Carolina and Ye, Fangwei and El Rouayheb, Salim}, + booktitle = {2019 IEEE International Symposium on Information Theory (ISIT)}, + pages = {817--821}, + year = {2019}, + organization = {IEEE} +} + @inproceedings{narayanan2008robust, title = {Robust de-anonymization of large sparse data sets}, author = {Narayanan, Arvind and Shmatikov, Vitaly}, @@ -1385,6 +1405,7 @@ publisher = {Cambridge university press} } +% new algorithm @misc{russell2018fitness, title = {Fitness app {Strava} exposes the location of military bases}, author = {Russell, Jon}, @@ -1402,7 +1423,6 @@ organization = {IEEE} } -% new algorithm @article{satyanarayanan2017emergence, title = {The emergence of edge computing}, author = {Satyanarayanan, Mahadev}, @@ -1664,6 +1684,16 @@ organization = {IEEE} } +@article{wang2021current, + title = {Why current differential privacy schemes are inapplicable for correlated data publishing?}, + author = {Wang, Hao and Xu, Zhengquan and Jia, Shan and Xia, Ying and Zhang, Xu}, + journal = {World Wide Web}, + volume = {24}, + pages = {1--23}, + year = {2021}, + publisher = 
{Springer} +} + @article{warner1965randomized, title = {Randomized response: A survey technique for eliminating evasive answer bias}, author = {Warner, Stanley L}, @@ -1799,6 +1829,32 @@ organization = {IEEE} } +@inproceedings{ye2019preserving, + title = {Preserving {ON-OFF} privacy for past and future requests}, + author = {Ye, Fangwei and Naim, Carolina and El Rouayheb, Salim}, + booktitle = {2019 IEEE Information Theory Workshop (ITW)}, + pages = {1--5}, + year = {2019}, + organization = {IEEE} +} + +@article{ye2020off, + title = {{ON-OFF} Privacy in the Presence of Correlation}, + author = {Ye, Fangwei and Naim, Carolina and El Rouayheb, Salim}, + journal = {arXiv preprint arXiv:2004.04186}, + year = {2020} +} + +@article{ye2021off, + title = {{ON-OFF} Privacy Against Correlation Over Time}, + author = {Ye, Fangwei and Naim, Carolina and El Rouayheb, Salim}, + journal = {IEEE Transactions on Information Forensics and Security}, + volume = {16}, + pages = {2104--2117}, + year = {2021}, + publisher = {IEEE} +} + @inproceedings{yuan2010t, title = {T-drive: driving directions based on taxi trajectories}, author = {Yuan, Jing and Zheng, Yu and Zhang, Chengyang and Xie, Wenlei and Xie, Xing and Sun, Guangzhong and Huang, Yan}, diff --git a/text/introduction/main.tex b/text/introduction/main.tex index c8aaa74..ec84715 100644 --- a/text/introduction/main.tex +++ b/text/introduction/main.tex @@ -68,18 +68,5 @@ Typically, in such cases, we have a collection of data referring to the same ind Additionally, in many cases, the privacy-preserving processes should take into account implicit correlations and restrictions that exist, e.g.,~space-imposed collocation or movement restrictions. Since these data are related to most of the important applications and services that enjoy high utilization rates, privacy-preserving continuous data publishing becomes one of the emblematic problems of our time. 
-To accompany and facilitate the descriptions in this chapter, we provide the following running example. - -\begin{example} - \label{ex:snapshot} - Users interact with an LBS by making queries in order to retrieve some useful location-based information or just reporting user-state at various locations. - This user--LBS interaction generates user-related data, organized in a schema with the following attributes: \emph{Name} (the unique identifier of the table), \emph{Age}, \emph{Location}, and \emph{Status} (Table~\ref{tab:snapshot-micro}). - The `Status' attribute includes information that characterizes the user's state or the query itself, and its value varies according to the service functionality. - Subsequently, the generated data are aggregated (by issuing count queries over them) in order to derive useful information about the popularity of the venues during the day (Table~\ref{tab:snapshot-statistical}). - - \includetable{snapshot} - -\end{example} - \input{introduction/contribution} \input{introduction/structure} diff --git a/text/preliminaries/correlation.tex b/text/preliminaries/correlation.tex index a33f15b..cf01257 100644 --- a/text/preliminaries/correlation.tex +++ b/text/preliminaries/correlation.tex @@ -60,6 +60,10 @@ A negative value shows that the behavior of one variable is the \emph{opposite} Zero means that the variables are not linked and are \emph{independent} of each other. A positive correlation indicates that the variables behave in a \emph{similar} manner, e.g.,~when the one decreases the other decreases as well. +Wang et al.~\cite{wang2021current} examined why current differential privacy methods that either increase the noise size to offset the privacy leakage caused by the correlation (model-based) or transform correlated data into independent series in another domain and process them independently (transform-based) are inapplicable for correlated data publishing. 
+They prove that the privacy distortion, which they quantify using entropy, after filtering out the independent and identically distributed noise from the correlated data by utilizing the data correlation (correlation-distinguishability attack) is equal to that of conditional probability inference. +They conclude that the problem stems from the difference of correlation between the noise that the current methods inject and the output data. + \subsection{Privacy loss under temporal correlation} \label{subsec:cor-temp} diff --git a/text/preliminaries/data.tex b/text/preliminaries/data.tex index b6fa4a0..c3f2add 100644 --- a/text/preliminaries/data.tex +++ b/text/preliminaries/data.tex @@ -13,6 +13,19 @@ We firstly classify data based on their content \kat{'based on their content' re \item \emph{Statistical data}---the outcome of statistical processes on microdata. \end{itemize} +To accompany and facilitate the descriptions in this chapter, we provide the following running example. + +\begin{example} + \label{ex:snapshot} + Users interact with an LBS by making queries in order to retrieve some useful location-based information or just reporting user-state at various locations. + This user--LBS interaction generates user-related data, organized in a schema with the following attributes: \emph{Name} (the unique identifier of the table), \emph{Age}, \emph{Location}, and \emph{Status} (Table~\ref{tab:snapshot-micro}). + The `Status' attribute includes information that characterizes the user's state or the query itself, and its value varies according to the service functionality. + Subsequently, the generated data are aggregated (by issuing count queries over them) in order to derive useful information about the popularity of the venues during the day (Table~\ref{tab:snapshot-statistical}). + + \includetable{snapshot} + +\end{example} + \kat{I miss the definition of data. 
You speak of data items, data values, what is the difference to data?} An example of microdata is displayed in Table~\ref{tab:snapshot-micro}, while an example of statistical data in Table~\ref{tab:snapshot-statistical}. Data, in either of these two forms, may have a special property called~\emph{continuity}, i.e.,~their values change and can be observed through time. \kat{The way that you define it here reminds temporal data. What is the difference?} diff --git a/text/preliminaries/main.tex b/text/preliminaries/main.tex index 114837e..2201564 100644 --- a/text/preliminaries/main.tex +++ b/text/preliminaries/main.tex @@ -1,13 +1,20 @@ \chapter{Preliminaries} \label{ch:prel} -\kat{mention also the different ways data are organized, e.g., as tuples in tables, KVs, graphs, etc and in what formats you consider them in this work.} -In this chapter, we introduce some relevant terminology and information around the problem of continuous publishing of privacy-sensitive data sets \kat{the title of the thesis is '..in user generated big data' not in 'continuous publishing'. Consider rephrase here, and if needed position the user generated big data w.r.t. the continuous publishing so that you continue later on discussing for the continuous publishing setting. }. -First, in Section~\ref{sec:data}, we categorize user-generated data sets and review data processing in the context of continuous data publishing. +% \kat{mention also the different ways data are organized, e.g., as tuples in tables, KVs, graphs, etc and in what formats you consider them in this work.} +In this chapter, we introduce some relevant terminology and information around the problem of +quality and privacy in user-generated Big Data with a special focus on continuous data publishing. +% continuous publishing of privacy-sensitive data sets +% \kat{the title of the thesis is '..in user generated big data' not in 'continuous publishing'. 
Consider rephrase here, and if needed position the user generated big data w.r.t. the continuous publishing so that you continue later on discussing for the continuous publishing setting. } +First, in Section~\ref{sec:data}, we categorize user-generated data sets, that we consider in a tabular form, and review data processing in the context of continuous data publishing. Second, in Section~\ref{sec:privacy}, we define information disclosure in data privacy. Thereafter, we list the categories of privacy attacks, %identified in the literature, -the possible privacy protection levels, the fundamental privacy operations that are applied to achieve data privacy, and finally we provide a brief overview of the \kat{also here reconsider the term seminal, so as it does not read like we are in the related work section} seminal works on privacy-preserving data publishing. -\kat{The correlations are not intuitively connected to privacy, so put here a linking sentence to data privacy.} -Third, in Section~\ref{sec:correlation}, we discuss the different types of correlation, we document ways to extract data dependence from continuous data, and we investigate the privacy risks that data correlation entails with special focus on the privacy loss under temporal correlation. +the possible privacy protection levels, the fundamental privacy operations that are applied to achieve data privacy, and finally we provide a brief overview of the +% \kat{also here reconsider the term seminal, so as it does not read like we are in the related work section} +% seminal works on privacy-preserving data publishing. +basic notions for data privacy protection. +% \kat{The correlations are not intuitively connected to privacy, so put here a linking sentence to data privacy.} +Third, in Section~\ref{sec:correlation}, we focus on the impact of correlation on data privacy. 
+More particularly, we discuss the different types of correlation, we document ways to extract data correlation from continuous data, and we investigate the privacy risks that data correlation entails with special focus on the privacy loss under temporal correlation. \input{preliminaries/data} \input{preliminaries/privacy} diff --git a/text/preliminaries/privacy.tex b/text/preliminaries/privacy.tex index 5de9d03..1fbc5d0 100644 --- a/text/preliminaries/privacy.tex +++ b/text/preliminaries/privacy.tex @@ -51,8 +51,11 @@ In order to better protect the privacy of Donald in case of attacks, the data sh \subsection{Levels of privacy protection} \label{subsec:prv-levels} -The information disclosure that a data release may entail is linked to the protection level that indicates \emph{what} a privacy-preserving algorithm is trying to achieve.\kat{I don't understand this first sentence} -More specifically, in continuous data publishing we consider the privacy protection level with respect to not only the users, but also to the \emph{events} occurring in the data. +% The information disclosure that a data release may entail is linked to the protection level that indicates \emph{what} a privacy-preserving algorithm is trying to achieve. +% \kat{I don't understand this first sentence} +% \mk{Same here...} +% More specifically, i +In continuous data publishing we consider the privacy protection level with respect to not only the users, but also to the \emph{events} occurring in the data. An event is a pair of an identifying attribute of an individual and the sensitive data (including contextual information) and we can see it as a correspondence to a record in a database, where each individual may participate once. Data publishers typically release events in the form of sequences of data items, usually indexed in time order (time series) and geotagged, e.g.,~(`Dewey', `at home at Montmartre at $t_1$'), \dots, (`Quackmore', `dining at Opera at $t_1$'). 
We use the term `users' to refer to the \emph{individuals}, also known as \emph{participants}, who are the source of the processed and published data. @@ -61,9 +64,13 @@ Users are subject to privacy attacks, and thus are the main point of interest of In more detail, the privacy protection levels are: \begin{enumerate}[(a)] - \item \emph{Event}~\cite{dwork2010differential, dwork2010pan}---limits the privacy protection to \emph{any single event} in a time series, providing maximum \kat{maximum? better say high} data utility. + \item \emph{Event}~\cite{dwork2010differential, dwork2010pan}---limits the privacy protection to \emph{any single event} in a time series, providing high + % \kat{maximum? better say high} + data utility. \item \emph{$w$-event}~\cite{kellaris2014differentially}---provides privacy protection to \emph{any sequence of $w$ events} in a time series. - \item \emph{User}~\cite{dwork2010differential, dwork2010pan}---protects \emph{all the events} in a time series, providing maximum\kat{maximum? better say high} privacy protection. + \item \emph{User}~\cite{dwork2010differential, dwork2010pan}---protects \emph{all the events} in a time series, providing high + % \kat{maximum? better say high} + privacy protection. \end{enumerate} Figure~\ref{fig:prv-levels} demonstrates the application of the possible protection levels on the statistical data of Example~\ref{ex:continuous}. @@ -71,6 +78,7 @@ For instance, in event-level (Figure~\ref{fig:level-event}) it is hard to determ Moreover, in user-level (Figure~\ref{fig:level-user}) it is hard to determine whether Quackmore was ever included in the released series of events at all. Finally, in $2$-event-level (Figure~\ref{fig:level-w-event}) it is hard to determine whether Quackmore was ever included in the released series of events between the timestamps $t_1$ and $t_2$, $t_2$ and $t_3$, etc. (i.e.,~for a window $w = 2$). 
\kat{Already, by looking at the original counts, for the reader it is hard to see if Quackmore was in the event/database. So, we don't really get the difference among the different levels here.} +\mk{It is without background knowledge.} \begin{figure}[htp] \centering @@ -83,7 +91,10 @@ Finally, in $2$-event-level (Figure~\ref{fig:level-w-event}) it is hard to deter \subcaptionbox{$2$-event-level\label{fig:level-w-event}}{% \includegraphics[width=.32\linewidth]{level-w-event}% }\hspace{\fill} - \caption{Protecting the data of Table~\ref{tab:continuous-statistical} on (a)~event-, (b)~user-, and (c)~$2$-event-level. A suitable distortion method can be applied accordingly. \kat{Why don't you distort the results already in this table?}} + \caption{Protecting the data of Table~\ref{tab:continuous-statistical} on (a)~event-, (b)~user-, and (c)~$2$-event-level. A suitable distortion method can be applied accordingly. + % \kat{Why don't you distort the results already in this table?} + % \mk{Because we've not discussed yet about these operations.} + } \label{fig:prv-levels} \end{figure} @@ -97,13 +108,30 @@ Although the described levels have been coined in the context of \emph{different \subsection{Privacy-preserving operations} \label{subsec:prv-operations} -Protecting private information, which is known by many names (obfuscation, cloaking, anonymization, etc.\kat{the techniques are not equivalent, so it is correct to say that they are different names for the same thing}), is achieved by using a specific basic \kat{but later you mention several ones.. so what is the specific basic one ?}privacy protection operation. -Depending on the intervention\kat{?, technique, algorithm, method, operation, intervention.. 
we are a little lost with the terminology and the difference among all these } that we choose to perform on the original data, we identify the following operations:\kat{you can mention that the different operations have different granularity} +Protecting private information +% , which is known by many names (obfuscation, cloaking, anonymization, etc.), +% \kat{the techniques are not equivalent, so it is correct to say that they are different names for the same thing} +is achieved by using a specific basic +% \kat{but later you mention several ones.. so what is the specific basic one ?} +privacy protection operation. +Depending on the +technique +% intervention +% \kat{?, technique, algorithm, method, operation, intervention.. we are a little lost with the terminology and the difference among all these } +that we choose to perform on the original data, we identify the following operations: +% \kat{you can mention that the different operations have different granularity} +% \mk{``granularity''?} \begin{itemize} - \item \emph{Aggregation}---group\kat{or combine? also maybe mention that the single value will replace the values of a specific attribute of these rows} together multiple rows of a data set to form a single value. + \item \emph{Aggregation}---combine + % group + % \kat{or combine? also maybe mention that the single value will replace the values of a specific attribute of these rows} + % together + multiple rows of a data set to form a single value which will replace these rows. \item \emph{Generalization}---replace an attribute value with a parent value in the attribute taxonomy (when applicable). - Notice that a step of generalization, may be followed by a step of \emph{specialization}, to improve the quality of the resulting data set.\kat{This technical detail is not totally clear at this point. 
Either elaborate or remove.} + % Notice that a step of generalization, may be followed by a step of \emph{specialization}, to improve the quality of the resulting data set. + % \kat{This technical detail is not totally clear at this point. Either elaborate or remove.} + % \mk{I cannot remember coming across it in the literature.} \item \emph{Suppression}---delete completely certain sensitive values or entire records. \item \emph{Perturbation}---disturb the initial attribute value in a deterministic or probabilistic way. The probabilistic data distortion is referred to as \emph{randomization}. @@ -114,15 +142,20 @@ If we want to protect the \emph{Age} of the user by aggregation, we may replace It is worth mentioning that there is a series of algorithms (e.g.,~\cite{benaloh2009patient, kamara2010cryptographic, cao2014privacy}) based on the \emph{cryptography} operation. However, the majority of these methods, among other assumptions that they make, have minimum or even no trust to the entities that handle the personal information. -Furthermore, the amount and the way of data processing of these techniques usually burden the overall procedure, deteriorate the utility of the resulting data sets, and restrict their applicability.\kat{All these points apply also to the non-cryptography techniques. So you should mostly point out that they do not only deteriorate the utility but make them non-usable at all.} +Furthermore, the amount and the way of data processing of these techniques usually burden the overall procedure, deteriorate the utility of the resulting data sets to a point where they are completely useless, and restrict their applicability. +% \kat{All these points apply also to the non-cryptography techniques. So you should mostly point out that they do not only deteriorate the utility but make them non-usable at all.} Our focus is limited to techniques that achieve a satisfying balance between both participants' privacy and data utility. 
-For these reasons, there will be no further discussion around this family of techniques in this article.\kat{sentence that fitted in the survey but not in the thesis so replace with a more pertinent comment} +% For these reasons, there will be no further discussion around this family of techniques in this article. +% \kat{sentence that fitted in the survey but not in the thesis so replace with a more pertinent comment} \subsection{Basic notions for privacy protection} \label{subsec:prv-seminal} -For completeness, in this section we present the seminal works for privacy-preserving data publishing, which, even though originally designed for the snapshot publishing scenario \kat{was dp designed for the snapshot publishing scenario?}, have paved the way, since many of the works in privacy-preserving continuous publishing are based on or extend them. +For completeness, in this section we present the seminal works for privacy-preserving data publishing, which, even though originally designed for the snapshot publishing scenario, +% \kat{was dp designed for the snapshot publishing scenario?} +% \mk{Not clearly but yes. We can write it since DP was coined in 2006, while DP under continual observation came later in 2010.} +have paved the way, since many of the works in privacy-preserving continuous publishing are based on or extend them. \subsubsection{Microdata} @@ -131,7 +164,12 @@ For completeness, in this section we present the seminal works for privacy-prese Sweeney coined \emph{$k$-anonymity}~\cite{sweeney2002k}, one of the first established works on data privacy. A released data set features $k$-anonymity protection when the sequence of values for a set of identifying attributes, called the \emph{quasi-identifiers}, is the same for at least $k$ records in the data set. Computing the quasi-identifiers in a set of attributes is still a hard problem on its own~\cite{motwani2007efficient}. 
-$k$-anonymity is syntactic\kat{meaning?}, it constitutes an individual indistinguishable from at least $k-1$ other individuals in the same data set.\kat{you just said this in another way,two sentences before} +% $k$-anonymity +% is syntactic, +% \kat{meaning?} +% it +% constitutes an individual indistinguishable from at least $k-1$ other individuals in the same data set. +% \kat{you just said this in another way,two sentences before} In a follow-up work~\cite{sweeney2002achieving}, the author describes a way to achieve $k$-anonymity for a data set by the suppression or generalization of certain values of the quasi-identifiers. Several works identified and addressed privacy concerns on $k$-anonymity. Machanavajjhala et al.~\cite{machanavajjhala2006diversity} pointed out that $k$-anonymity is vulnerable to homogeneity and background knowledge attacks. @@ -146,17 +184,26 @@ A data set features $\theta$-closeness when all of its groups satisfy $\theta$- The main drawback of $k$-anonymity (and its derivatives) is that it is not tolerant to external attacks of re-identification on the released data set. The problems identified in~\cite{sweeney2002k} appear when attempting to apply $k$-anonymity on continuous data publishing (as we will also see next in Section~\ref{sec:micro}). These attacks include multiple $k$-anonymous data set releases with the same record order, subsequent releases of a data set without taking into account previous $k$-anonymous releases, and tuple updates. -Proposed solutions include rearranging the attributes, setting the whole attribute set of previously released data sets as quasi-identifiers or releasing data based on previous $k$-anonymous releases.\kat{and the citations of these solutions?} +Proposed solutions include rearranging the attributes, setting the whole attribute set of previously released data sets as quasi-identifiers or releasing data based on previous $k$-anonymous releases~\cite{simi2017extensive}. 
+% \kat{and the citations of these solutions?} \subsubsection{Statistical data} \label{subsec:prv-statistical} -While methods based on $k$-anonymity have been mainly employed for releasing microdata, \emph{differential privacy}~\cite{dwork2006calibrating} has been proposed for releasing high utility aggregates over microdata while providing semantic\kat{semantic ?} privacy guarantees. -Differential privacy is algorithmic \kat{algorithmic? moreover, you repeat this sentence later on, after the definition of neighboring datasets}, it ensures that any adversary observing a privacy-protected output, no matter his/her computational power or auxiliary information, cannot conclude with absolute certainty if an individual is included in the input data set. +While methods based on $k$-anonymity have been mainly employed for releasing microdata, \emph{differential privacy}~\cite{dwork2006calibrating} has been proposed for releasing high utility aggregates over microdata while providing semantic +% \kat{semantic ?} +privacy guarantees that characterize the output data. +Differential privacy is algorithmic, +% \kat{algorithmic? moreover, you repeat this sentence later on, after the definition of neighboring datasets} +it characterizes the data publishing process which passes its privacy guarantee to the resulting data. +It ensures that any adversary observing a privacy-protected output, no matter their computational power or auxiliary information, cannot conclude with absolute certainty if an individual is included in the input data set. Moreover, it quantifies and bounds the impact that the addition/removal of an individual to/from a data set has on the derived privacy-protected aggregates thereof. +More precisely, differential privacy quantifies the impact of the addition/removal of a single tuple in $D$ on the output $\pmb{o}$ of a privacy mechanism $\mathcal{M}$. 
+% \kat{what is M?} +The distribution of all $\pmb{o}$, in some range $\mathcal{O}$, is not affected \emph{substantially}, i.e.,~it changes only slightly due to the modification of any one tuple in all possible $D \in \mathcal{D}$. -\kat{introduce the following definition, and link it to the text before. Maybe you can put the definition after the following paragraph.} +% \kat{introduce the following definition, and link it to the text before. Maybe you can put the definition after the following paragraph.} \begin{definition} [Neighboring data sets] @@ -164,10 +211,13 @@ Moreover, it quantifies and bounds the impact that the addition/removal of an in Two data sets are neighboring (or adjacent) when they differ by at most one tuple, i.e.,~one can be obtained by adding/removing the data of an individual to/from the other. \end{definition} -More precisely, differential privacy quantifies the impact of the addition/removal of a single tuple in $D$ on the output $\pmb{o}$ of $\mathcal{M}$. \kat{what is M?} -The distribution of all $\pmb{o}$, in some range $\mathcal{O}$, is not affected \emph{substantially}, i.e.,~it changes only slightly due to the modification of any one tuple in all possible $D \in \mathcal{D}$. -Thus, differential privacy is algorithmic\kat{??}, it ensures that any adversary observing any $\pmb{o}$ cannot conclude with absolute certainty whether or not any individual is included in any $D$. -Its performance is irrelevant to the computational power and auxiliary information available to an adversary observing the outputs of $\mathcal{M}$.\kat{you already said this. Moreover, it is irrelevant to the neighboring datasets and thus does not fit here..} +% Thus, differential privacy +% is algorithmic, +% \kat{??} +% it +% ensures that any adversary observing any $\pmb{o}$ cannot conclude with absolute certainty whether or not any individual is included in any $D$. 
+% Its performance is irrelevant to the computational power and auxiliary information available to an adversary observing the outputs of $\mathcal{M}$. +% \kat{you already said this. Moreover, it is irrelevant to the neighboring datasets and thus does not fit here..} \begin{definition} [Differential privacy] @@ -176,16 +226,39 @@ Its performance is irrelevant to the computational power and auxiliary informati $$\Pr[\mathcal{M}(D) \in O] \leq e^\varepsilon \Pr[\mathcal{M}(D') \in O]$$ \end{definition} -\noindent $\Pr[\cdot]$ denotes the probability of $\mathcal{M}$ generating $\pmb{o}$ \kat{there is no o in the definition above} as output from $O \subseteq \mathcal{O}$, when given $D$ as input. +\noindent $\Pr[\cdot]$ denotes the probability of $\mathcal{M}$ generating an output +% $\pmb{o}$ +% \kat{there is no o in the definition above} +% as output +from all possible $O \subseteq \mathcal{O}$, when given $D$ as input. The \emph{privacy budget} $\varepsilon$ is a positive real number that represents the user-defined privacy goal~\cite{mcsherry2009privacy}. -As the definition implies, $\mathcal{M}$ achieves stronger privacy protection for lower values of $\varepsilon$ since the probabilities of $D$ and $D'$ being true worlds are similar, but the utility of $\pmb{o}$ \kat{there is no o in the definition above} is reduced since more randomness is introduced by $\mathcal{M}$. +As the definition implies, $\mathcal{M}$ achieves stronger privacy protection for lower values of $\varepsilon$ since the probabilities of $D$ and $D'$ being true worlds are similar, but the utility of the output +% $\pmb{o}$ +% \kat{there is no o in the definition above} +is reduced since more randomness is introduced by $\mathcal{M}$. The privacy budget $\varepsilon$ is usually set to $0.01$, $0.1$, or, in some cases, $\ln2$ or $\ln3$~\cite{lee2011much}. 
-Its local variant~\cite{duchi2013local} is compatible with microdata, where $D$ is composed of a single data item and is represented by $x$.\kat{Seems out of place and needs to be described a little more..} +% Its local variant~\cite{duchi2013local} is compatible with microdata, where $D$ is composed of a single data item and is represented by $x$.\kat{Seems out of place and needs to be described a little more..} +% We refer the interested reader to~\cite{desfontaines2020sok} for a systematic taxonomy of the different variants and extensions of differential privacy. -We refer the interested reader to~\cite{desfontaines2020sok} for a systematic taxonomy of the different variants and extensions of differential privacy. +The applicability +% pertinence +% \kat{pertinence to what?} +of differential privacy mechanisms is inseparable from the query's +% \kat{here, you need to associate a mechanism M to the query, because so far you have been talking for mechanisms} +function sensitivity. +The presence/absence of a single record should only change the result slightly, +% \kat{do you want to say 'should' and not 'can'?} +and therefore differential privacy methods are best for low sensitivity queries such as counts. +However, sum, max, and in some cases average +% \kat{and average } +queries can be problematic since a single (but outlier) value could change the output noticeably, making it necessary to add a lot of noise to the query's answer. 
+<<<<<<< HEAD \kat{introduce and link to the previous text the following definition} +======= +% \kat{introduce and link to the previous text the following definition } +>>>>>>> 744bed7ac1bc6669742b970ea6f0f399200db538 \begin{definition} [Query function sensitivity] @@ -194,12 +267,15 @@ We refer the interested reader to~\cite{desfontaines2020sok} for a systematic ta $$\Delta f = \max_{D, D' \in \mathcal{D}} \lVert {f(D) - f(D')} \rVert_{1}$$ \end{definition} +<<<<<<< HEAD The pertinence \kat{pertinence to what?} of differential privacy methods is inseparable from the query's \kat{here, you need to associate a mechanism M to the query, because so far you have been talking for mechanisms} function sensitivity. The presence/absence of a single record can only change the result slightly\kat{do you want to say 'should' and not 'can'?}, and therefore differential privacy methods are best for low sensitivity queries such as counts. However, sum and max \kat{and average } queries can be problematic since a single (but outlier) value could change the output noticeably, making it necessary to add a lot of noise to the query's answer. \kat{How does the following connects to the query's sensitivity?}Furthermore, asking a series of queries may allow the disambiguation between possible data sets, making it necessary to add even more noise to the outputs. For this reason, after a series of queries exhausts the available privacy budget \kat{you have not talked about the sequential theorem, so this comes out of the blue} the data set has to be discarded. \kat{THe following is an explanation of the previous. 
When you restate sth in different words for explanation, please say that you do so, otherwise it is not clear what new you want to convey.}Keeping the original guarantee across multiple queries that return different/new answers \kat{why only different?even the same query multiple times would have the same results} requires the injection of noise proportional to the number of the executed queries, and thus destroying the utility of the output. +======= +>>>>>>> 744bed7ac1bc6669742b970ea6f0f399200db538 \paragraph{Privacy mechanisms} \label{subsec:prv-mech} @@ -269,6 +345,14 @@ Generally, when we apply a series of independent (i.e.,~in the way that they inj The privacy guarantee of $m \in \mathbb{Z}^+$ independent privacy mechanisms, satisfying $\varepsilon_1$-, $\varepsilon_2$-, \dots, $\varepsilon_m$-differential privacy respectively, when applied over the same data set equals to $\sum_{i = 1}^m \varepsilon_i$. \end{theorem} +% \kat{How does the following connects to the query's sensitivity?} +Asking a series of queries may allow the disambiguation between possible data sets, making it necessary to add even more noise to the outputs. +% \kat{The following is an explanation of the previous. When you restate sth in different words for explanation, please say that you do so, otherwise it is not clear what new you want to convey.} +Keeping the original guarantee across multiple queries that require different/new answers requires the injection of noise proportional to the number of the executed queries, thus destroying the utility of the output. +For this reason, after a series of queries exhausts the available privacy budget +% \kat{you have not talked about the sequential theorem, so this comes out of the blue} +the data set has to be discarded. + Notice that the sequential composition corresponds to the worst case scenario where each time we use a mechanism we have to invest some (or all) of the available privacy budget.
In the special case that we query disjoint data sets, we can take advantage of the \emph{parallel} composition property~\cite{mcsherry2009privacy, soria2016big}, and thus spare some of the available privacy budget. diff --git a/text/related/micro.tex b/text/related/micro.tex index 27ed3c9..8c3704d 100644 --- a/text/related/micro.tex +++ b/text/related/micro.tex @@ -416,3 +416,22 @@ According to the technique's intuition, stronger correlations result in higher p However, the loss is smaller when the dimension of the transition matrix, which is extracted according to the modeling of the correlations (here it is Markov chain), is larger due to the fact that larger transition matrices tend to be uniform, resulting in weaker data dependence. The authors investigate briefly all of the possible privacy levels; however, the solutions that they propose are suitable only for the event-level. Last but not least, the technique requires the calculation of the temporal privacy loss for every individual within the data set which might prove computationally inefficient in real-time scenarios. + +% ON-OFF privacy with correlated requests +% Preserving ON-OFF Privacy for Past and Future Requests +% ON-OFF Privacy in the Presence of Correlation +% ON-OFF Privacy Against Correlation Over Time +% - microdata +% - infinite +% - streaming +% - dependence +% - event +% - ON-OFF privacy +% - randomization +% - serial (Markov chain with N states) +\hypertarget{naim2019off}{Naim et al.}~\cite{naim2019off, ye2019preserving, ye2020off, ye2021off} proposed the notion of \emph{ON-OFF privacy}, according to which users require privacy protection only at certain timestamps over time. +They investigate the privacy risk due to the correlation between a user's requests when toggling the privacy protection ON and OFF. +The goal is to minimize the information throughput and always answer users' requests while protecting their requests to online services when privacy is set to ON.
+They model the dependence between requests using a Markov chain, which is publicly known, where each state represents an available service. +Setting privacy to ON, the user obfuscates their original query by randomly sending requests to (and receiving answers from) a subset of all of the available services. +Although this randomization step makes the original query indistinguishable while making sure that the users always get the information that they need, there is no clear quantification of the privacy guarantee that the scheme offers over time. diff --git a/text/related/statistical.tex b/text/related/statistical.tex index 480b57d..f80b080 100644 --- a/text/related/statistical.tex +++ b/text/related/statistical.tex @@ -354,3 +354,38 @@ The Perturber consumes the incoming data stream, adds noise $\varepsilon_p$, whi The data-adaptive Grouper consumes the original stream and partitions the data into well-approximated regions using, also part of the available privacy budget, $\varepsilon_g$. Finally, a query specific Smoother combines the independent information produced by the Perturber and the Grouper, and performs post-processing by calculating the final estimates of the Perturber's values for each partition created by the Grouper at each timestamp. The combination of the Perturber and the Grouper follows the sequential composition and post-processing properties of differential privacy, thus, the resulting algorithm satisfies ($\varepsilon_p + \varepsilon_g$)-differential privacy. + +% Temporally Discounted Differential Privacy for Evolving Datasets on an Infinite Horizon +% - statistical +% - infinite +% - streaming +% - linkage +% - - +% - differential privacy +% - perturbation (Laplace) +\hypertarget{farokhi2020temporally}{Farokhi}~\cite{farokhi2020temporally} proposed a relaxation of the user-level protection of differential privacy based on the discounted utility theory in economics. 
+More specifically, at each timestamp, the scheme of \emph{temporally discounted differential privacy} assigns different weights to the privacy budgets that have been invested in previous timestamps. +These weights decrease the further that we observe in the past. +The author implements an exponentially and a hyperbolically discounted scheme. +In the former, the discount factor, which is positive and less than $1$, and in the latter, the discounting coefficient, which is greater than or equal to $0$, allow the adjustment of temporal discounting. +Increasing the discount factor offers stronger privacy protection, equivalent to that of user-level. +Whereas, increasing the discount coefficient resembles the behavior of event-level differential privacy. +Selecting a suitable value for the privacy budget and the discount parameter allows for bounding the overall privacy loss in an infinite observation scenario. +However, the assumption that all users discount previous data releases limits the applicability of the current scheme in real-world scenarios for statistical data. + +% Real-Time Privacy-Preserving Data Release Over Vehicle Trajectory +% - statistical +% - infinite +% - streaming +% - linkage +% - global +% - w-event +% - differential privacy +% - perturbation (Laplace) +\hypertarget{ma2019real}{Ma et al.}~\cite{ma2019real} implemented \emph{RPTR}, a $w$-event differential privacy mechanism for protecting statistics of vehicular trajectory data in real time. +RPTR adapts the rate with which it samples data according to the accuracy with which it can predict future statistics based on historical data and position transfer probability matrix and according to how much the original data change through time based on Pearson coefficient. +Before releasing data statistics, the mechanism perturbs the original values with Laplacian noise the impact of which is mitigated by using Ensemble Kalman filtering.
+The combination of adaptive sampling and filtering can improve the accuracy when predicting the values of non-sampled data points, thus saving more privacy budget (i.e.,~higher data utility) for data points that the mechanism decides to release. +The mechanism detects highly frequented map regions and, using a quad-tree, it calculates each region's privacy weight. +In their implementation, the authors assume that highly frequented regions tend to be more privacy sensitive, and thus more noise (i.e.,~less privacy budget to invest) needs to be introduced before publicly releasing the users' data falling into these regions. +The efficiency (both in terms of user privacy and data utility) of the mechanism depends on the number of regions into which it divides the map, and therefore the challenge of its optimal division is an interesting future research topic.