about summary refs log tree commit diff
diff options
context:
space:
mode:
author Nathan Perry <avaglir@gmail.com> 2017-12-08 14:37:07 -0500
committer Nathan Perry <avaglir@gmail.com> 2017-12-08 14:37:07 -0500
commit fa549476445594e1feeb05b7782bb954e09f7580 (patch)
tree de7e94ded6ce92011f7897d5ffcdab4324472ca2
parent 821f882320ad5df5ba10681b322fc7b3b8fd25ca (diff)
reduce logging and add bad_set
-rw-r--r-- .gitignore 1
-rw-r--r-- download_references.py 83
2 files changed, 45 insertions, 39 deletions
diff --git a/.gitignore b/.gitignore
index 3f9358e..358b100 100644
--- a/.gitignore
+++ b/.gitignore
@@ -111,3 +111,4 @@ splits
refdata
logs
wiki.xml
+bad.txt
diff --git a/download_references.py b/download_references.py
index cb09552..0436a8a 100644
--- a/download_references.py
+++ b/download_references.py
@@ -5,6 +5,7 @@ import os
import unicodedata
from argparse import ArgumentParser
from concurrent.futures import ProcessPoolExecutor
+from functools import partial
from hashlib import md5
import requests
@@ -24,36 +25,45 @@ config = arg_parser.parse_args()
req_headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0'}
+bad_set = set()
+if os.path.exists('bad.txt'):
+ with open('bad.txt') as bad_f:
+ bad_set = {line.strip() for line in bad_f.readlines()}
-def check_ref(ref):
+
+def check_resp(requests_callable, ident):
# noinspection PyBroadException
try:
- resp = requests.head(ref, headers=req_headers, allow_redirects=True, timeout=1)
+ resp = requests_callable()
except requests.Timeout:
- logger.info('ref {} timed out'.format(ref))
- return False
+ logger.info('{} timed out'.format(ident))
+ return None
except requests.ConnectionError:
- logger.info('ref {} failed to connect'.format(ref))
- return False
+ logger.info('{} failed to connect'.format(ident))
+ return None
except:
- logger.exception('unexpected exception reading pdf head')
- return False
+ logger.exception('unexpected exception in request')
+ return None
if not (200 <= resp.status_code < 299):
- logger.info('ref {} status {}'.format(ref, resp.status_code))
- return False
+ logger.info('{} status {}'.format(ident, resp.status_code))
+ return None
cnt_type = resp.headers.get('content-type', '').lower()
if cnt_type != 'x-pdf' and cnt_type != 'application/pdf':
- logger.debug('ref {} ignored (not pdf)'.format(ref))
- return False
+ logger.debug('{} ignored (not pdf)'.format(ident))
+ return None
- return True
+ return resp
def download_references(filename):
+ local_bad_set = set()
+
logger.info('downloading references for {}'.format(filename))
+ _rsrcmgr = PDFResourceManager()
+
with open(filename) as f:
data = json.load(f)
@@ -65,43 +75,33 @@ def download_references(filename):
m.update(ref.encode('utf-8'))
digest = m.hexdigest()
+ if digest in local_bad_set or digest in bad_set:
+ logger.debug('ref {} ignored (known bad)'.format(ref))
+ continue
+
tgt_file = 'refdata/{}.txt'.format(digest)
if os.path.exists(tgt_file) and not config.overwrite:
logger.debug('ref {} ignored (file existed)'.format(ref))
continue
- if not check_ref(ref):
+ resp = check_resp(partial(requests.head, ref, headers=req_headers, allow_redirects=True, timeout=1),
+ 'ref {}'.format(ref))
+ if not resp:
+ local_bad_set.add(ref)
continue
logger.info('downloading ref {}'.format(ref))
- # noinspection PyBroadException
- try:
- resp = requests.get(ref, headers=req_headers, timeout=3, stream=True)
- except requests.Timeout:
- logger.info('ref {} timed out'.format(ref))
- continue
- except requests.ConnectionError:
- logger.info('ref {} failed to connect'.format(ref))
- continue
- except:
- logger.exception('unexpected exception reading pdf')
+ resp = check_resp(partial(requests.get, ref, headers=req_headers, timeout=3, stream=True),
+ 'ref {}'.format(ref))
+ if not resp:
+ local_bad_set.add(ref)
continue
- if not (200 <= resp.status_code < 299):
- logger.info('ref {} status {}').format(ref, resp.status_code)
- continue
-
- cnt_type = resp.headers.get('content-type', '').lower()
- if cnt_type != 'x-pdf' and cnt_type != 'application/pdf':
- logger.debug('ref {} ignored (not pdf)'.format(ref))
- continue
-
- logger.info('ref {} successfully downloaded'.format(ref))
+ logger.debug('ref {} successfully downloaded'.format(ref))
logger.debug('parsing ref {}'.format(ref))
- _rsrcmgr = PDFResourceManager()
pdfstr = io.StringIO()
_device = TextConverter(_rsrcmgr, pdfstr, codec='utf-8', laparams=LAParams())
pdfinterp = PDFPageInterpreter(_rsrcmgr, _device)
@@ -120,6 +120,11 @@ def download_references(filename):
logger.info('wrote text for {}'.format(ref))
+ with open('bad.txt', 'a') as f:
+ f.writelines('{}\n'.format(elem) for elem in local_bad_set.difference(bad_set))
+
+ bad_set.update(local_bad_set)
+
def main():
if not os.path.exists('refdata'):
@@ -133,16 +138,16 @@ def main():
logging.getLogger('urllib3').setLevel(logging.INFO)
root_logger = logging.getLogger()
- root_logger.setLevel(logging.DEBUG)
+ root_logger.setLevel(logging.INFO)
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(logging.Formatter('%(name)s - %(levelname)s - %(message)s'))
- stream_handler.setLevel(logging.INFO)
+ # stream_handler.setLevel(logging.INFO)
file_handler = logging.FileHandler('logs/dl.log', mode='w')
file_handler.setFormatter(logging.Formatter('[%(asctime)s | %(name)s | %(levelname)s] %(message)s',
'%Y-%m-%d %H:%M:%S'))
- file_handler.setLevel(logging.DEBUG)
+ # file_handler.setLevel(logging.INFO)
root_logger.addHandler(stream_handler)
root_logger.addHandler(file_handler)