the-last-thing/code/parse_t-drive.py

133 lines
3.9 KiB
Python
Raw Normal View History

2021-10-09 04:00:36 +02:00
#!/usr/bin/env python3
import argparse
import csv
from datetime import datetime
from geopy.distance import distance
import io
import lmdk_lib
import numpy as np
import os
import matplotlib.pyplot as plt
import sys
import time
import zipfile
'''
############ Stats ###########
Trajectories: 10357
Length
Total : 17662984
Minimum :
Maximum :
Landmarks : 212816
Length
Total : 6851102 (27.54%)
Minimum : 1
Maximum : 6896
##############################
'''
# https://cloud.delkappa.com/s/2Rs2wjS8zmt5bAE
# T-drive data format: zero-based column indices into each comma-separated
# record of a trajectory file (main() uses them to index the rows it reads).
# User ID
uid = 0
# Coordinates (latitude is read from column 3, longitude from column 2)
lat, lng = 3, 2
# Timestamp
tim = 1
# Timestamp format (handed to datetime.strptime to parse the `tim` column)
tim_fmt = "%Y-%m-%d %H:%M:%S"
def _read_points(dat):
    '''
      Read the points of a single trajectory file.

      Parameters:
        dat - An open text stream of comma-separated records.
      Returns:
        points - The [uid, lat, lng, timestamp] rows parsed before any error.
        err    - The exception that stopped the parsing, or None.
    '''
    points = []
    try:
        for p in csv.reader(dat, delimiter=','):
            # Convert here so that a malformed field only ends this file
            # instead of failing later while the array is being built.
            points.append([
                float(p[uid]), float(p[lat]), float(p[lng]),
                # Naive datetime: timestamp() interprets it as local time.
                datetime.strptime(p[tim], tim_fmt).timestamp(),
            ])
    except Exception as e:
        # Best effort: hand back the rows read so far along with the error.
        return points, e
    return points, None


def main(args):
    '''
      Load or build the users' data and landmarks, then print the stats.

      Results of a previous run are reused: the landmarks are recomputed
      only when lmdk_lib.load_data returns an empty 'usrs_lmdks' array, and
      the archive is parsed only when it also returns an empty 'usrs_data'
      array.

      Parameters:
        args - The parsed command-line arguments. `args.arc` is read here;
               the whole object is forwarded to the lmdk_lib helpers.
    '''
    # Get users' landmarks from previous parsing
    usrs_lmdks = lmdk_lib.load_data(args, 'usrs_lmdks')
    if usrs_lmdks.size == 0:
        # Get users' data from previous parsing
        usrs_data = lmdk_lib.load_data(args, 'usrs_data')
        if usrs_data.size == 0:
            # NOTE(review): float32 cannot hold present-day epoch seconds
            # exactly (the spacing is 128 s around 1.2e9). The dtype is kept
            # because stored results and lmdk_lib presumably expect it -
            # confirm before widening.
            chunks = [np.empty((0, 4), np.float32)]
            parsed = False
            # Parse users' data
            try:
                print('Extracting %s... ' %(os.path.abspath(args.arc)), end='', flush=True)
                with zipfile.ZipFile(args.arc, 'r') as arc:
                    print('[OK]')
                    # Get the list of users
                    print('Getting users... ', end='', flush=True)
                    # Keep the entry names reported by the archive rather
                    # than rebuilding them from a hard-coded directory.
                    traj_files = [name for name in arc.namelist() if name.endswith('.txt')]
                    # Sort users numerically (the user is the file's base name)
                    traj_files.sort(key=lambda name: int(name.rsplit('/', 1)[-1].split('.')[0]))
                    print('[OK]')
                    total = len(traj_files)
                    for idx, traj_file in enumerate(traj_files, start=1):
                        # Progress follows the position in the list, not the user ID
                        print('[%d%% (%d/%d)] Loading data from %s... ' %((idx/total)*100, idx, total, traj_file), end='', flush=True)
                        with io.TextIOWrapper(arc.open(traj_file), newline='\n') as dat:
                            points, err = _read_points(dat)
                        print('[OK]' if err is None else '[Error: %s]' %(err))
                        if points:
                            chunks.append(np.asarray(points, dtype=np.float32))
                parsed = True
            except Exception as e:
                print('[Error: %s]' %(e))
            # Join once; growing the array per user would copy it every time.
            # Whatever was parsed before a failure is still kept.
            usrs_data = np.concatenate(chunks, axis=0)
            if parsed:
                try:
                    # Save to results
                    lmdk_lib.save_data(args, usrs_data, 'usrs_data')
                except Exception as e:
                    print('[Error: %s]' %(e))
        # Find users' landmarks
        usrs_lmdks = lmdk_lib.find_lmdks(usrs_data, args)
        # Save to results
        lmdk_lib.save_data(args, usrs_lmdks, 'usrs_lmdks')
    # Landmarks' stats
    lmdk_lib.lmdks_stats(args, usrs_lmdks)
'''
Parse arguments.
Optional:
arc - The archive file.
dist - The coordinates distance threshold in meters.
per - The timestamps period threshold in minutes.
res - The results zip file.
'''
def parse_args():
    '''
      Build the command-line parser and parse the arguments.

      Returns:
        args - The namespace holding the `arc`, `dist`, `per`, and `res` options.
    '''
    # (short flag, long flag, help text, type, default); every option is optional.
    options = (
        ('-a', '--arc', 'The data archive file.', str, '/home/manos/Cloud/Data/T-drive/Data.zip'),
        ('-l', '--dist', 'The coordinates distance threshold in meters.', int, 200),
        ('-p', '--per', 'The timestaps period threshold in mimutes.', int, 30),
        ('-r', '--res', 'The results archive file.', str, '/home/manos/Cloud/Data/T-drive/Results.zip'),
    )
    parser = argparse.ArgumentParser()
    for short, long_, text, kind, fallback in options:
        parser.add_argument(short, long_, help=text, type=kind, default=fallback)
    return parser.parse_args()
if __name__ == '__main__':
    # Run the parser and report the elapsed wall-clock time.
    try:
        start_time = time.time()
        main(parse_args())
        end_time = time.time()
        print('##############################')
        print('Time : %.4fs' % (end_time - start_time))
        print('##############################')
    except KeyboardInterrupt:
        print('Interrupted by user.')
        # sys.exit rather than the bare exit(): the latter is injected by the
        # site module for interactive use and is absent under `python -S`.
        sys.exit()