diff options
author | Nathan Perry <avaglir@gmail.com> | 2017-12-08 14:37:07 -0500 |
---|---|---|
committer | Nathan Perry <avaglir@gmail.com> | 2017-12-08 14:37:07 -0500 |
commit | fa549476445594e1feeb05b7782bb954e09f7580 (patch) | |
tree | de7e94ded6ce92011f7897d5ffcdab4324472ca2 | |
parent | 821f882320ad5df5ba10681b322fc7b3b8fd25ca (diff) |
reduce logging and add bad_set
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | download_references.py | 83 |
2 files changed, 45 insertions, 39 deletions
@@ -111,3 +111,4 @@ splits refdata logs wiki.xml +bad.txt diff --git a/download_references.py b/download_references.py index cb09552..0436a8a 100644 --- a/download_references.py +++ b/download_references.py @@ -5,6 +5,7 @@ import os import unicodedata from argparse import ArgumentParser from concurrent.futures import ProcessPoolExecutor +from functools import partial from hashlib import md5 import requests @@ -24,36 +25,45 @@ config = arg_parser.parse_args() req_headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0'} +bad_set = set() +if os.path.exists('bad.txt'): + with open('bad.txt') as bad_f: + bad_set = {line.strip() for line in bad_f.readlines()} -def check_ref(ref): + +def check_resp(requests_callable, ident): # noinspection PyBroadException try: - resp = requests.head(ref, headers=req_headers, allow_redirects=True, timeout=1) + resp = requests_callable() except requests.Timeout: - logger.info('ref {} timed out'.format(ref)) - return False + logger.info('{} timed out'.format(ident)) + return None except requests.ConnectionError: - logger.info('ref {} failed to connect'.format(ref)) - return False + logger.info('{} failed to connect'.format(ident)) + return None except: - logger.exception('unexpected exception reading pdf head') - return False + logger.exception('unexpected exception in request') + return None if not (200 <= resp.status_code < 299): - logger.info('ref {} status {}'.format(ref, resp.status_code)) - return False + logger.info('{} status {}'.format(ident, resp.status_code)) + return None cnt_type = resp.headers.get('content-type', '').lower() if cnt_type != 'x-pdf' and cnt_type != 'application/pdf': - logger.debug('ref {} ignored (not pdf)'.format(ref)) - return False + logger.debug('{} ignored (not pdf)'.format(ident)) + return None - return True + return resp def download_references(filename): + local_bad_set = set() + logger.info('downloading references for {}'.format(filename)) + _rsrcmgr = PDFResourceManager() + with open(filename) as f: data = json.load(f) @@ -65,43 +75,33 @@ def download_references(filename): m.update(ref.encode('utf-8')) digest = m.hexdigest() + if digest in local_bad_set or digest in bad_set: + logger.debug('ref {} ignored (known bad)'.format(ref)) + continue + tgt_file = 'refdata/{}.txt'.format(digest) if os.path.exists(tgt_file) and not config.overwrite: logger.debug('ref {} ignored (file existed)'.format(ref)) continue - if not check_ref(ref): + resp = check_resp(partial(requests.head, ref, headers=req_headers, allow_redirects=True, timeout=1), + 'ref {}'.format(ref)) + if not resp: + local_bad_set.add(ref) continue logger.info('downloading ref {}'.format(ref)) - # noinspection PyBroadException - try: - resp = requests.get(ref, headers=req_headers, timeout=3, stream=True) - except requests.Timeout: - logger.info('ref {} timed out'.format(ref)) - continue - except requests.ConnectionError: - logger.info('ref {} failed to connect'.format(ref)) - continue - except: - logger.exception('unexpected exception reading pdf') + resp = check_resp(partial(requests.get, ref, headers=req_headers, timeout=3, stream=True), + 'ref {}'.format(ref)) + if not resp: + local_bad_set.add(ref) continue - if not (200 <= resp.status_code < 299): - logger.info('ref {} status {}').format(ref, resp.status_code) - continue - - cnt_type = resp.headers.get('content-type', '').lower() - if cnt_type != 'x-pdf' and cnt_type != 'application/pdf': - logger.debug('ref {} ignored (not pdf)'.format(ref)) - continue - - logger.info('ref {} successfully downloaded'.format(ref)) + logger.debug('ref {} successfully downloaded'.format(ref)) logger.debug('parsing ref {}'.format(ref)) - _rsrcmgr = PDFResourceManager() pdfstr = io.StringIO() _device = TextConverter(_rsrcmgr, pdfstr, codec='utf-8', laparams=LAParams()) pdfinterp = PDFPageInterpreter(_rsrcmgr, _device) @@ -120,6 +120,11 @@ def download_references(filename): logger.info('wrote text for {}'.format(ref)) + with open('bad.txt', 'a') as f: + f.writelines('{}\n'.format(elem) for elem in local_bad_set.difference(bad_set)) + + bad_set.update(local_bad_set) + def main(): if not os.path.exists('refdata'): @@ -133,16 +138,16 @@ def main(): logging.getLogger('urllib3').setLevel(logging.INFO) root_logger = logging.getLogger() - root_logger.setLevel(logging.DEBUG) + root_logger.setLevel(logging.INFO) stream_handler = logging.StreamHandler() stream_handler.setFormatter(logging.Formatter('%(name)s - %(levelname)s - %(message)s')) - stream_handler.setLevel(logging.INFO) + # stream_handler.setLevel(logging.INFO) file_handler = logging.FileHandler('logs/dl.log', mode='w') file_handler.setFormatter(logging.Formatter('[%(asctime)s | %(name)s | %(levelname)s] %(message)s', '%Y-%m-%d %H:%M:%S')) - file_handler.setLevel(logging.DEBUG) + # file_handler.setLevel(logging.INFO) root_logger.addHandler(stream_handler) root_logger.addHandler(file_handler) |