# crawler.py
import calendar
import shlex
import subprocess
import time
# Crawl tweets for a fixed hashtag in consecutive date windows by running
# Exporter.py once per window. Windows whose Exporter run exits non-zero are
# appended to fail.log and the crawl continues with the next window.

# --- Configuration ----------------------------------------------------------
maxtweets = 15000          # forwarded to Exporter.py as --maxtweets
t_format = '%Y-%m-%d'      # format of the dates below and of the CLI date args
start_date = '2008-12-01'  # first day of the overall crawl range
end_date = '2017-12-31'    # upper bound of the overall crawl range
base_cmd = 'python2 Exporter.py --querysearch "#timessquare" --since {} --until {} --maxtweets {} --output datasets/{}_output_got.csv'

# Window arithmetic is done in UTC epoch seconds. Using local time
# (mktime/localtime) lets a DST transition move a boundary to 23:00 of the
# previous day, which would shift every formatted window date by one.
start = calendar.timegm(time.strptime(start_date, t_format))
end = calendar.timegm(time.strptime(end_date, t_format))
step = 3 * 30 * 24 * 3600  # window length: 90 days, in seconds

metric_s = time.time()
with open('fail.log', 'a') as f:
    while start < end:
        # Clamp the final window so it never reaches past end_date.
        # NOTE(review): if Exporter.py treats --until as exclusive, end_date
        # itself is not crawled -- confirm against Exporter.py.
        to = min(start + step, end)

        since = time.strftime(t_format, time.gmtime(start))
        until = time.strftime(t_format, time.gmtime(to))
        label = since + '_' + until
        print('Processing: ' + label)

        cmd = base_cmd.format(since, until, maxtweets, label)
        try:
            # shlex.split honours the quoting in base_cmd, so the query is
            # passed as one argument without the literal quote characters
            # (a plain str.split() would keep them and break on spaces).
            subprocess.check_output(shlex.split(cmd), stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as e:
            # Record the failed window together with the captured
            # stdout/stderr, and flush so the entry survives an interruption.
            f.write('Crawling failed! start={}, end={}, error={}\n'.format(since, until, e.output))
            f.flush()

        # The next window starts where this one ended.
        start = to

metric_e = time.time()
print('Consume time {} hours.'.format((metric_e - metric_s) / 3600.0))