evaluation: Added details
This commit is contained in:
		@ -1251,6 +1251,12 @@
 | 
				
			|||||||
  publisher = {ACM}
 | 
					  publisher = {ACM}
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@techreport{makonin2018hue,
 | 
				
			||||||
 | 
					  title  = {HUE: The hourly usage of energy dataset for buildings in British Columbia},
 | 
				
			||||||
 | 
					  author = {Makonin, Stephen},
 | 
				
			||||||
 | 
					  year   = {2018}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@article{matyas1990visual,
 | 
					@article{matyas1990visual,
 | 
				
			||||||
  title     = {Visual analysis of single-case time series: Effects of variability, serial dependence, and magnitude of intervention effects},
 | 
					  title     = {Visual analysis of single-case time series: Effects of variability, serial dependence, and magnitude of intervention effects},
 | 
				
			||||||
  author    = {Matyas, Thomas A and Greenwood, Kenneth M},
 | 
					  author    = {Matyas, Thomas A and Greenwood, Kenneth M},
 | 
				
			||||||
@ -1478,6 +1484,17 @@
 | 
				
			|||||||
  organization = {IEEE}
 | 
					  organization = {IEEE}
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@article{sapiezynski2019interaction,
 | 
				
			||||||
 | 
					  title     = {Interaction data from the {C}openhagen {N}etworks {S}tudy},
 | 
				
			||||||
 | 
					  author    = {Sapiezynski, Piotr and Stopczynski, Arkadiusz and Lassen, David Dreyer and Lehmann, Sune},
 | 
				
			||||||
 | 
					  journal   = {Scientific Data},
 | 
				
			||||||
 | 
					  volume    = {6},
 | 
				
			||||||
 | 
					  number    = {1},
 | 
				
			||||||
 | 
					  pages     = {1--10},
 | 
				
			||||||
 | 
					  year      = {2019},
 | 
				
			||||||
 | 
					  publisher = {Nature Publishing Group}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@article{satyanarayanan2017emergence,
 | 
					@article{satyanarayanan2017emergence,
 | 
				
			||||||
  title     = {The emergence of edge computing},
 | 
					  title     = {The emergence of edge computing},
 | 
				
			||||||
  author    = {Satyanarayanan, Mahadev},
 | 
					  author    = {Satyanarayanan, Mahadev},
 | 
				
			||||||
@ -1736,6 +1753,14 @@
 | 
				
			|||||||
  publisher = {Elsevier}
 | 
					  publisher = {Elsevier}
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@inproceedings{wang2017locally,
 | 
				
			||||||
 | 
					  title     = {Locally differentially private protocols for frequency estimation},
 | 
				
			||||||
 | 
					  author    = {Wang, Tianhao and Blocki, Jeremiah and Li, Ninghui and Jha, Somesh},
 | 
				
			||||||
 | 
					  booktitle = {26th {USENIX} Security Symposium ({USENIX} Security 17)},
 | 
				
			||||||
 | 
					  pages     = {729--745},
 | 
				
			||||||
 | 
					  year      = {2017}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@inproceedings{wang2017privacy,
 | 
					@inproceedings{wang2017privacy,
 | 
				
			||||||
  title        = {Privacy Preserving Anonymity for Periodical SRS Data Publishing},
 | 
					  title        = {Privacy Preserving Anonymity for Periodical SRS Data Publishing},
 | 
				
			||||||
  author       = {Wang, Jie-Teng and Lin, Wen-Yang},
 | 
					  author       = {Wang, Jie-Teng and Lin, Wen-Yang},
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										96
									
								
								text/evaluation/details.tex
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										96
									
								
								text/evaluation/details.tex
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,96 @@
 | 
				
			|||||||
 | 
					\section{Details}
 | 
				
			||||||
 | 
					\label{sec:eval-dtl}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					In this section we list all the relevant details regarding the setting of the evaluation (Section~\ref{subsec:eval-setup}), and the real and synthetic data sets that we used (Section~\ref{subsec:eval-dat}), along with the corresponding configurations (Section~\ref{subsec:eval-conf}).
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					\subsection{Setting}
 | 
				
			||||||
 | 
					\label{subsec:eval-setup}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					We implemented our experiments\footnote{Code available at \url{https://git.delkappa.com/manos/the-last-thing}} in Python $3$.$9$.$7$ and executed them on a machine with an Intel i$7$-$6700$HQ at $3$.$5$GHz CPU and $16$GB RAM, running Manjaro Linux $21$.$1$.$5$.
 | 
				
			||||||
 | 
					We repeated each experiment $100$ times and we report the mean over these iterations.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					\subsection{Data sets}
 | 
				
			||||||
 | 
					\label{subsec:eval-dat}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					\subsubsection{Real}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					\paragraph{Copenhagen}~\cite{sapiezynski2019interaction}
 | 
				
			||||||
 | 
					data set that was collected via the smartphone devices of $851$ university students over a period of $4$ weeks as part of the Copenhagen Networks Study.
 | 
				
			||||||
 | 
					Each device was configured to be discoverable by and to discover nearby Bluetooth devices every $5$ minutes.
 | 
				
			||||||
 | 
					Upon discovery each device registers, (i)~the timestamp in seconds, (ii)~the device's unique identifier, (iii)~the unique identifier of the device that it discovered ($- 1$ when no device was found or $- 2$ for any non-participating device), and (iv)~the Received Signal Strength Indicator (RSSI) in dBm.
 | 
				
			||||||
 | 
					Half of the devices have registered data for at least $81\%$ of the possible timestamps.
 | 
				
			||||||
 | 
					From this data set, we utilized the first $1{,}000$ contacts out of $12{,}167$ valid unique contacts of the device with identifier `$449$'.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					\paragraph{HUE}~\cite{makonin2018hue}
 | 
				
			||||||
 | 
					contains the hourly energy consumption data of $22$ residential customers of BCHydro, a provincial power utility, in British Columbia.
 | 
				
			||||||
 | 
					The measurements for each residence are saved individually and each measurement contains (i)~the date (YYYY-MM-DD), (ii)~the hour, and (iii)~the energy consumption in kWh.
 | 
				
			||||||
 | 
					In our experiments, we used the first $1{,}000$ out of $29{,}231$ measurements of the residence with identifier `$1$', average energy consumption equal to $0.88$kWh, and value range $[0.28$, $4.45]$.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					\paragraph{T-drive}~\cite{yuan2010t}
 | 
				
			||||||
 | 
					consists of $15$ million GPS data points of the trajectories of $10{,}357$ taxis in Beijing, spanning a period of $1$ week and a total distance of $9$ million kilometers.
 | 
				
			||||||
 | 
					The taxis reported their location data on average every $177$ seconds and $623$ meters approximately.
 | 
				
			||||||
 | 
					Each vehicle registers (i)~the taxi unique identifier, (ii)~the timestamp (YYYY-MM-DD HH:MM:SS), (iii)~longitude, and (iv)~latitude.
 | 
				
			||||||
 | 
					These measurements are stored individually per vehicle.
 | 
				
			||||||
 | 
					We sampled the first $1000$ data items of the taxi with identifier `$2$'.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					\subsubsection{Synthetic}
 | 
				
			||||||
 | 
					We generated synthetic time series of length equal to $100$ timestamps, for which we varied the number and distribution of {\thethings}.
 | 
				
			||||||
 | 
					We take into account only the temporal order of the points and the position of regular and {\thething} events within the series. 
 | 
				
			||||||
 | 
					% Note, that for the experiments performed on the synthetic data sets, the original values to be released do not influence the outcome of our conclusions, thus we ignore them.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					\subsection{Configurations}
 | 
				
			||||||
 | 
					\label{subsec:eval-conf}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					\subsubsection{{\Thethings}' percentage}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					For Copenhagen, we achieve 
 | 
				
			||||||
 | 
					$0\%$ {\thethings} by considering an empty list of contact devices,
 | 
				
			||||||
 | 
					$20\%$ by extending the list with $[3$, $6$, $11$, $12$, $25$, $29$, $36$, $39$, $41$, $46$, $47$, $50$, $52$, $56$, $57$, $61$, $63$, $78$, $80]$, 
 | 
				
			||||||
 | 
					$40\%$ with $[81$, $88$, $90$, $97$, $101$, $128$, $130$, $131$, $137$, $145$, $146$, $148$, $151$, $158$, $166$, $175$, $176]$, 
 | 
				
			||||||
 | 
					$60\%$ with $[181$, $182$, $192$, $195$, $196$, $201$, $203$, $207$, $221$, $230$, $235$, $237$, $239$, $241$, $254]$, 
 | 
				
			||||||
 | 
					$80\%$ with $[260$, $282$, $287$, $289$, $290$, $291$, $308$, $311$, $318$, $323$, $324$, $330$, $334$, $335$, $344$, $350$, $353$, $355$, $357$, $358$, $361$, $363]$, and 
 | 
				
			||||||
 | 
					$100\%$ by including all of the possible contacts.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					In HUE, we get $0$, $20$, $40$, $60$, $80$, and $100$ {\thethings} percentages by setting the energy consumption threshold below $0.28$, $1.12$, $0.88$, $0.68$, $0.54$, $4.45$kWh respectively.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					In T-drive, we achieved the desired {\thethings} percentages by utilizing the method of Li et al.~\cite{li2008mining} for detecting stay points in trajectory data.
 | 
				
			||||||
 | 
					In more detail, the algorithm checks for each data item if each subsequent item is within a given distance threshold $\Delta l$ and measures the time period $\Delta t$ between the present point and the last subsequent point.
 | 
				
			||||||
 | 
					We achieve $0$, $20$, $40$, $60$, $80$, and $100$ {\thethings} percentages by setting the ($\Delta l$ in meters, $\Delta t$ in minutes) pairs input to the stay point discovery method as [($0$, $1000$), ($2095$, $30$), ($2790$, $30$), ($3590$, $30$), ($4825$, $30$), ($10350$, $30$)].
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					We generated synthetic data with \emph{skewed} (the {\thethings} are distributed towards the beginning/end of the series), \emph{symmetric} (in the middle), \emph{bimodal} (both end and beginning), and \emph{uniform} (all over the time series) {\thething} distributions.
 | 
				
			||||||
 | 
					In order to get {\thethings} with the above distribution features, we generate probability distributions with appropriate characteristics and sample from them, without replacement, the desired number of points.
 | 
				
			||||||
 | 
					%The generated distributions are representative of the cases that we wish to examine during the experiments.
 | 
				
			||||||
 | 
					For example, for a left-skewed {\thethings} distribution we would utilize a truncated distribution resulting from the restriction of the domain of a distribution to the beginning and end of the time series with its location shifted to the center of the right half of the series.
 | 
				
			||||||
 | 
					For consistency, we calculate the scale parameter depending on the length of the series by setting it equal to the series' length over a constant.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					\subsubsection{Temporal correlation}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					We model the temporal correlation in the synthetic data as a \emph{stochastic matrix} $P$, using a \emph{Markov Chain}~\cite{gagniuc2017markov}.
 | 
				
			||||||
 | 
					$P$ is an $n \times n$ matrix, where the element $P_{ij}$
 | 
				
			||||||
 | 
					%at the $i$th row of the $j$th column that 
 | 
				
			||||||
 | 
					represents the transition probability from a state $i$ to another state $j$.
 | 
				
			||||||
 | 
					%, $\forall i, j \leq n$.
 | 
				
			||||||
 | 
					It holds that the elements of every row $i$ of $P$ sum up to $1$.
 | 
				
			||||||
 | 
					We follow the \emph{Laplacian smoothing} technique~\cite{sorkine2004laplacian} as utilized in~\cite{cao2018quantifying} to generate the matrix $P$ with a degree of temporal correlation $s > 0$, where each element $P_{ij}$ is equal to
 | 
				
			||||||
 | 
					% and generate a stochastic matrix $P$ with a degree of temporal correlation $s$ by calculating each element $P_{ij}$ as follows
 | 
				
			||||||
 | 
					\[ \frac{(I_{n})_{ij} + s}{\sum_{k = 1}^{n}((I_{n})_{ik} + s)} \]
 | 
				
			||||||
 | 
					where $I_{n}$ is an \emph{identity matrix} of size $n$.
 | 
				
			||||||
 | 
					%, i.e.,~an $n \times n$ matrix with $1$s on its main diagonal and $0$s elsewhere.
 | 
				
			||||||
 | 
					% $s$ takes only positive values which are comparable only for stochastic matrices of the same size.
 | 
				
			||||||
 | 
					The value of $s$ is comparable only for stochastic matrices of the same size and dictates the strength of the correlation; the lower its value, 
 | 
				
			||||||
 | 
					% the lower the degree of uniformity of each row, and therefore 
 | 
				
			||||||
 | 
					the stronger the correlation degree.
 | 
				
			||||||
 | 
					%In general, larger transition matrices tend to be uniform, resulting in weaker correlation.
 | 
				
			||||||
 | 
					In our experiments, for simplicity, we set $n = 2$ and we investigate the effect of \emph{weak} ($s = 1$), \emph{moderate} ($s = 0.1$), and \emph{strong} ($s = 0.01$) temporal correlation degree on the overall privacy loss.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					\subsubsection{Privacy parameters}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					To perturb the contact tracing data of Copenhagen, we utilize the \emph{randomized response} technique to report with probability $p = \frac{e^\varepsilon}{e^\varepsilon + 1}$ whether the current contact is a {\thething} or not.
 | 
				
			||||||
 | 
					We randomize the energy consumption in HUE with the Laplace mechanism.
 | 
				
			||||||
 | 
					To perturb the spatial values in T-drive, we inject noise that we sample from the Planar Laplace mechanism~\cite{andres2013geo}.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					We set the privacy budget $\varepsilon = 1$, and, for simplicity, we assume that for every query sensitivity it holds that $\Delta f = 1$. 
 | 
				
			||||||
 | 
					% Finally, notice that, depending on the results' variation, most diagrams are in logarithmic scale.
 | 
				
			||||||
@ -1,5 +1,6 @@
 | 
				
			|||||||
\chapter{Evaluation}
 | 
					\chapter{Evaluation}
 | 
				
			||||||
\label{ch:eval}
 | 
					\label{ch:eval}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					\input{evaluation/details}
 | 
				
			||||||
\input{evaluation/thething}
 | 
					\input{evaluation/thething}
 | 
				
			||||||
\input{evaluation/theotherthing}
 | 
					\input{evaluation/theotherthing}
 | 
				
			||||||
 | 
				
			|||||||
@ -13,56 +13,6 @@ This happens due the fact that at each timestamp we take into account only the d
 | 
				
			|||||||
Whereas, when each timestamp corresponds to a {\thething} we consider and protect all the events throughout the entire series (user-level).
 | 
					Whereas, when each timestamp corresponds to a {\thething} we consider and protect all the events throughout the entire series (user-level).
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
\subsection{Setting, configurations, and data sets}
 | 
					 | 
				
			||||||
\paragraph{Setting}
 | 
					 | 
				
			||||||
We implemented our experiments\footnote{Code available at \url{https://gitlab.com/adhesivegoldfinch/cikm}} in Python $3$.$9$.$5$ and executed them on a machine with Intel i$7$-$6700$HQ $3$.$5$GHz CPU and $16$GB RAM, running Manjaro $21$.$0$.$5$.
 | 
					 | 
				
			||||||
We repeated each experiment $100$ times and we report the mean over these iterations.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
\paragraph{Data sets}
 | 
					 | 
				
			||||||
For the \emph{real} data sets, we used the Geolife~\cite{zheng2010geolife} and T-drive~\cite{yuan2010t} from which we sampled the first $1000$ data items.
 | 
					 | 
				
			||||||
We achieved the desired {\thethings} percentages by utilizing the method of Li et al.~\cite{li2008mining} for detecting stay points in trajectory data.
 | 
					 | 
				
			||||||
In more detail, the algorithm checks for each data item if each subsequent item is within a given distance threshold $\Delta l$ and measures the time period $\Delta t$ between the present point and the last subsequent point.
 | 
					 | 
				
			||||||
We achieve $0$, $20$ $40$, $60$, $80$, and $100$ {\thethings} percentages by setting the ($\Delta l$ in meters, $\Delta t$ in minutes) pairs input to the stay point discovery method for T-drive as [($0$, $1000$), ($2095$, $30$), ($2790$, $30$), ($3590$, $30$), ($4825$, $30$), ($10350$, $30$)] and for Geolife as [($0$, $100000$), ($205$, $30$), ($450$, $30$), ($725$, $30$), ($855$, $30$), ($50000$, $30$)].
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
Next, we generated synthetic time series of length equal to $100$ timestamps, for which we varied the number and distribution of {\thethings}.
 | 
					 | 
				
			||||||
% to achieve the necessary {\thethings} distribution and percentage for where applicable.
 | 
					 | 
				
			||||||
% \paragraph{{\Thethings} distribution}
 | 
					 | 
				
			||||||
We created \emph{left-skewed} (the {\thethings} are distributed towards the end), \emph{symmetric} (in the middle), \emph{right-skewed} (in the beginning), \emph{bimodal} (both end and beginning), and \emph{uniform} (all over the time series) {\thething} distributions.
 | 
					 | 
				
			||||||
%, in the beginning and in the end (\emph{bimodal}), and all over the extend (\emph{uniform}) of a time series.
 | 
					 | 
				
			||||||
When pertinent, we group the left- and right-skewed cases as simply `skewed', since they share several features due to symmetry.
 | 
					 | 
				
			||||||
In order to get {\thethings} with the above distribution features, we generate probability distributions with appropriate characteristics and sample from them, without replacement, the desired number of points.
 | 
					 | 
				
			||||||
%The generated distributions are representative of the cases that we wish to examine during the experiments.
 | 
					 | 
				
			||||||
% For example, for a left-skewed {\thethings} distribution we would utilize a truncated distribution resulting from the restriction of the domain of a normal distribution to the beginning and end of the time series with its location shifted to the center of the right half of the series.
 | 
					 | 
				
			||||||
For consistency, we calculate the scale parameter depending on the length of the series by setting it equal to the series' length over a constant.
 | 
					 | 
				
			||||||
%We take into account only the temporal order of the points and the position of regular and {\thething} events within the series. 
 | 
					 | 
				
			||||||
Note, that for the experiments performed on the synthetic data sets, the original values to be released do not influence the outcome of our conclusions, thus we ignore them.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
\paragraph{Configurations}
 | 
					 | 
				
			||||||
We model the temporal correlation in the synthetic data as a \emph{stochastic matrix} $P$, using a \emph{Markov Chain}~\cite{gagniuc2017markov}.
 | 
					 | 
				
			||||||
$P$ is a $n \times n$ matrix, where the element $p_{ij}$
 | 
					 | 
				
			||||||
%at the $i$th row of the $j$th column that 
 | 
					 | 
				
			||||||
represents the transition probability from a state $i$ to another state $j$.
 | 
					 | 
				
			||||||
%, $\forall i, j \leq n$.
 | 
					 | 
				
			||||||
It holds that the elements of every row $j$ of $P$ sum up to $1$.
 | 
					 | 
				
			||||||
We follow the \emph{Laplacian smoothing} technique~\cite{sorkine2004laplacian} as utilized in~\cite{cao2018quantifying} to generate the matrix $P$ with a degree of temporal correlation $s>0$.
 | 
					 | 
				
			||||||
% and generate a stochastic matrix $P$ with a degree of temporal correlation $s$ by calculating each element $P_{ij}$ as follows
 | 
					 | 
				
			||||||
%$$\frac{(I_{n})_{ij} + s}{\sum_{k = 1}^{n}((I_{n})_{jk} + s)}$$
 | 
					 | 
				
			||||||
%where $I_{n}$ is an \emph{identity matrix} of size $n$.
 | 
					 | 
				
			||||||
%, i.e.,~an $n \times n$ matrix with $1$s on its main diagonal and $0$s elsewhere.
 | 
					 | 
				
			||||||
% $s$ takes only positive values which are comparable only for stochastic matrices of the same size.
 | 
					 | 
				
			||||||
$s$ dictates the strength of the correlation; the lower its value, 
 | 
					 | 
				
			||||||
%the lower the degree of uniformity of each row, and therefore 
 | 
					 | 
				
			||||||
the stronger the correlation degree.
 | 
					 | 
				
			||||||
%In general, larger transition matrices tend to be uniform, resulting in weaker correlation.
 | 
					 | 
				
			||||||
In our experiments, for simplicity, we set $n = 2$ and we investigate the effect of \emph{weak} ($s = 1$), \emph{moderate} ($s = 0.1$), and \emph{strong} ($s = 0.01$) temporal correlation degree on the overall privacy loss.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
We set $\varepsilon = 1$.
 | 
					 | 
				
			||||||
To perturb the spatial values of the real data sets, we inject noise that we sample from the Planar Laplace mechanism~\cite{andres2013geo}. 
 | 
					 | 
				
			||||||
Finally, notice that all diagrams are in logarithmic scale.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
\subsection{Experiments}
 | 
					\subsection{Experiments}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
\paragraph{Budget allocation schemes}
 | 
					\paragraph{Budget allocation schemes}
 | 
				
			||||||
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user