the-last-thing/code/parse_copenhagen.py

187 lines
5.4 KiB
Python
Raw Permalink Normal View History

2021-09-22 21:44:38 +02:00
#!/usr/bin/env python3
import sys
sys.path.insert(1, 'lib')
import argparse
import ast
2021-09-22 21:44:38 +02:00
import csv
from datetime import datetime
from geopy.distance import distance
import io
import itertools
2021-09-22 21:44:38 +02:00
import lmdk_lib
import numpy as np
import os
import matplotlib.pyplot as plt
import time
import zipfile
2021-09-23 19:52:53 +02:00
'''
  Users found suitable for experiments when considering only their first
  1000 contacts.
  usr - The user ID.
  n   - Presumably the user's total number of contacts (TODO confirm).

  usr  n
  449  12167
  550  4221
  689  3228
  705  5
'''
2021-09-22 21:44:38 +02:00
# Data source: https://cloud.delkappa.com/s/ACMsDr2jnW3b6Np
# Copenhagen data format.
# The values below are used both to index the fields of each CSV row and
# to index the columns of the contacts array built from those rows.
# Header size, i.e., the number of leading CSV rows to skip.
hdr = 1
# Column of the timestamp.
tim = 0
# Column of user ID A (the user whose contacts are examined).
uid_a = 1
# Column of user ID B (the user that A came in contact with).
uid_b = 2
# Column of the Received Signal Strength Indicator (RSSI).
rssi = 3
def _extract_contacts(args):
    '''
      Read the valid contacts from the contacts file of the data archive.

      A row is kept when its user B is neither '-1' nor '-2', its RSSI is
      not '0', and its two user IDs differ.

      Parameters:
        args - The parsed arguments (`arc` and `cont` are used).
      Returns:
        A list of [tim, uid_a, uid_b, rssi] rows (string fields), or None
        if the archive or the contacts file could not be processed.
    '''
    cont = []
    try:
        print('Extracting %s... ' %(os.path.abspath(args.arc)), end='', flush=True)
        with zipfile.ZipFile(args.arc, 'r') as arc:
            print('[OK]')
            with io.TextIOWrapper(arc.open(args.cont), newline='\n') as dat:
                print('Finding contacts... ', end='', flush=True)
                # Stream the rows (skipping the header) instead of loading
                # the whole file in memory first.
                rows = itertools.islice(csv.reader(dat, delimiter=','), hdr, None)
                for c in rows:
                    if c[uid_b] not in ('-1', '-2') and c[rssi] != '0' and c[uid_a] != c[uid_b]:
                        # Add valid contact
                        cont.append([c[tim], c[uid_a], c[uid_b], c[rssi]])
                print('[OK]')
    except Exception as e:
        # Script-level boundary: report the problem and let the caller stop,
        # so that a partial contact list is never cached in the results.
        print('[Error: %s]' %(e))
        return None
    return cont


def _select_landmarks(usr, cont_ids, goal):
    '''
      Find, for every goal, a set of contacts whose removal discards that
      fraction of the user's contacts.

      For each goal the distinct contact IDs are added one by one (in
      ascending order) until the fraction of removed rows is within 1% of
      the goal (relative error).  The search stops for the user as soon as
      the fraction overshoots a goal.

      Parameters:
        usr      - The user ID.
        cont_ids - 1-D array with user ID B of each of the user's contacts.
        goal     - The fractions of contacts to remove.
      Returns:
        A list of [usr, goal, str(contact IDs)] entries if the last goal
        was matched, otherwise an empty list.
    '''
    total = len(cont_ids)
    # The distinct contacts do not depend on the goal; compute them once.
    conts = np.unique(cont_ids)
    found = []
    matched_last = False
    for g in goal:
        # Possible contacts
        pos_cont = []
        # Rows removed so far; updated incrementally instead of re-filtering
        # a fresh copy of the contacts for every candidate.
        removed = np.zeros(total, dtype=bool)
        for c in conts:
            # Add possible contacts gradually
            pos_cont.append(int(c))
            removed |= cont_ids == int(c)
            # Compare the difference
            diff = np.count_nonzero(removed)/total
            # Check if it is close enough to what we need
            if abs(diff - g)/g < .01:
                found.append([int(usr), g, str(pos_cont)])
                matched_last = g == goal[len(goal) - 1]
                break
            elif diff > g:
                # Overshot the goal; this user is not suitable.
                print('[%.2f]' %(diff))
                return []
    if matched_last:
        # That's a keeper
        print('[OK]')
        return found
    return []


def main(args):
    '''
      Parse the Copenhagen contacts and find the users (and their
      landmarks) that are suitable for experiments.

      The contacts and the suitable users are loaded from previous parsing
      via lmdk_lib.load_data when available; otherwise they are computed
      and stored via lmdk_lib.save_data.  Finally, every suitable user is
      printed together with the number of their contacts and the process
      exits.

      Parameters:
        args - The parsed arguments (see parse_args).
      Returns:
        Nothing.  Returns early if no contacts could be obtained.
    '''
    # Load data
    # Get contacts from previous parsing
    cont_data = lmdk_lib.load_data(args, 'cont')
    if cont_data.size == 0:
        # Contacts [tim, uid_a, uid_b, rssi]
        cont = _extract_contacts(args)
        if not cont:
            # Without contacts the array below would be 1-D and every
            # column access would fail; stop before caching an empty result.
            if cont is not None:
                print('No valid contacts found.')
            return
        # Remove duplicates
        cont_data = np.unique(np.array(cont, np.float32), axis=0)
        # Save to results
        lmdk_lib.save_data(args, cont_data, 'cont')
    # Get users' landmarks
    # Get all users
    usrs = np.unique(cont_data[:, uid_a])
    # Fractions of a user's contacts to be removed
    goal = [.2, .4, .6, .8]
    # Get users' data from previous parsing
    usrs_data = lmdk_lib.load_data(args, 'usrs_data')
    if usrs_data.size == 0:
        # Users suitable for experiments
        usrs_expt = []
        # Check each user
        for usr_i, usr in enumerate(usrs):
            print('Checking %d (%d%%: %d/%d)... ' %(usr, (usr_i + 1)*100/len(usrs), usr_i + 1, len(usrs)), end='', flush=True)
            # User's first 1000 contacts
            usr_cont = cont_data[cont_data[:, uid_a] == usr][:1000]
            usrs_expt += _select_landmarks(usr, usr_cont[:, uid_b], goal)
        # Save to results
        usrs_data = np.array(usrs_expt, str)
        lmdk_lib.save_data(args, usrs_data, 'usrs_data')
    if usrs_data.size == 0:
        # An empty array is 1-D, so its first column cannot be selected.
        print('No users suitable for experiments.')
    else:
        # Get all suitable users and their number of contacts
        for usr in np.unique(usrs_data[:, 0]):
            usr_cont = cont_data[cont_data[:, uid_a] == float(usr)]
            print(int(usr), len(usr_cont))
    # NOTE: the process ends here, as before; sys.exit replaces the
    # interactive-only exit() helper.
    sys.exit()
    # '''
    #   Get contacts for user 623
    #   9378 contacts
    # '''
    # usr = '623'
    # # All user contacts
    # usr_cont = cont_data[cont_data[:, uid_a] == float(usr)]
    # # All user landmarks for different goals
    # usr_lmdk = usrs_data[usrs_data[:, 0] == usr]
    # for g in goal:
    #     # Get goal landmarks
    #     cont = ast.literal_eval(usr_lmdk[usr_lmdk[:, 1] == str(g)][0][2])
    #     usr_cont_cur = np.copy(usr_cont)
    #     # Remove goal landmarks from contacts
    #     for c in cont:
    #         usr_cont_cur = usr_cont_cur[usr_cont_cur[:, uid_b] != c]
    #     # Check
    #     print(g, (len(usr_cont) - len(usr_cont_cur))/len(usr_cont))
2021-09-22 21:44:38 +02:00
def parse_args():
    '''
      Parse arguments.

      Optional:
        arc  - The data archive file.
        cont - The contacts data file.
        res  - The results archive file.

      Returns:
        The namespace holding the parsed arguments.
    '''
    # (short flag, long flag, help text, default value); all are optional.
    options = (
        ('-a', '--arc', 'The data archive file.', '/home/manos/Cloud/Data/Copenhagen/Data.zip'),
        ('-c', '--cont', 'The contacts data file.', 'bt_symmetric.csv'),
        ('-r', '--res', 'The results archive file.', '/home/manos/Cloud/Data/Copenhagen/Results.zip'),
    )
    parser = argparse.ArgumentParser()
    for short_flag, long_flag, description, fallback in options:
        parser.add_argument(short_flag, long_flag, help=description, type=str, default=fallback)
    return parser.parse_args()
# Script entry point: run the parsing and report the elapsed wall-clock time.
if __name__ == '__main__':
    try:
        began = time.time()
        main(parse_args())
        elapsed = time.time() - began
        # Frame the timing report between two rulers.
        ruler = '##############################'
        print(ruler)
        print('Time : %.4fs' % (elapsed))
        print(ruler)
    except KeyboardInterrupt:
        # Ctrl-C: leave quietly instead of dumping a traceback.
        print('Interrupted by user.')
        exit()