Reduce logging and add bad_set (persisted in bad.txt) to skip known-bad references

author: Nathan Perry <avaglir@gmail.com> 2017-12-08 14:37:07 -0500
committer: Nathan Perry <avaglir@gmail.com> 2017-12-08 14:37:07 -0500
commit: fa549476445594e1feeb05b7782bb954e09f7580 (patch)
tree: de7e94ded6ce92011f7897d5ffcdab4324472ca2
parent: 821f882320ad5df5ba10681b322fc7b3b8fd25ca (diff)
2 files changed, 45 insertions, 39 deletions
diff --git a/.gitignore b/.gitignore
index 3f9358e..358b100 100644
--- a/.gitignore
+++ b/.gitignore
@@ -111,3 +111,4 @@ splits
 refdata
 logs
 wiki.xml
+bad.txt
diff --git a/download_references.py b/download_references.py
index cb09552..0436a8a 100644
--- a/download_references.py
+++ b/download_references.py
@@ -5,6 +5,7 @@ import os
 import unicodedata
 from argparse import ArgumentParser
 from concurrent.futures import ProcessPoolExecutor
+from functools import partial
 from hashlib import md5
 
 import requests
@@ -24,36 +25,45 @@ config = arg_parser.parse_args()
 
 req_headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0'}
 
+bad_set = set()
+if os.path.exists('bad.txt'):
+    with open('bad.txt') as bad_f:
+        bad_set = {line.strip() for line in bad_f.readlines()}
 
-def check_ref(ref):
+
+def check_resp(requests_callable, ident):
     # noinspection PyBroadException
     try:
-        resp = requests.head(ref, headers=req_headers, allow_redirects=True, timeout=1)
+        resp = requests_callable()
     except requests.Timeout:
-        logger.info('ref {} timed out'.format(ref))
-        return False
+        logger.info('{} timed out'.format(ident))
+        return None
     except requests.ConnectionError:
-        logger.info('ref {} failed to connect'.format(ref))
-        return False
+        logger.info('{} failed to connect'.format(ident))
+        return None
     except:
-        logger.exception('unexpected exception reading pdf head')
-        return False
+        logger.exception('unexpected exception in request')
+        return None
 
     if not (200 <= resp.status_code < 299):
-        logger.info('ref {} status {}'.format(ref, resp.status_code))
-        return False
+        logger.info('{} status {}'.format(ident, resp.status_code))
+        return None
 
     cnt_type = resp.headers.get('content-type', '').lower()
     if cnt_type != 'x-pdf' and cnt_type != 'application/pdf':
-        logger.debug('ref {} ignored (not pdf)'.format(ref))
-        return False
+        logger.debug('{} ignored (not pdf)'.format(ident))
+        return None
 
-    return True
+    return resp
 
 
 def download_references(filename):
+    local_bad_set = set()
+
     logger.info('downloading references for {}'.format(filename))
 
+    _rsrcmgr = PDFResourceManager()
+
     with open(filename) as f:
         data = json.load(f)
 
@@ -65,43 +75,33 @@ def download_references(filename):
             m.update(ref.encode('utf-8'))
             digest = m.hexdigest()
 
+            if digest in local_bad_set or digest in bad_set:
+                logger.debug('ref {} ignored (known bad)'.format(ref))
+                continue
+
             tgt_file = 'refdata/{}.txt'.format(digest)
 
             if os.path.exists(tgt_file) and not config.overwrite:
                 logger.debug('ref {} ignored (file existed)'.format(ref))
                 continue
 
-            if not check_ref(ref):
+            resp = check_resp(partial(requests.head, ref, headers=req_headers, allow_redirects=True, timeout=1),
+                              'ref {}'.format(ref))
+            if not resp:
+                local_bad_set.add(ref)
                 continue
 
             logger.info('downloading ref {}'.format(ref))
-            # noinspection PyBroadException
-            try:
-                resp = requests.get(ref, headers=req_headers, timeout=3, stream=True)
-            except requests.Timeout:
-                logger.info('ref {} timed out'.format(ref))
-                continue
-            except requests.ConnectionError:
-                logger.info('ref {} failed to connect'.format(ref))
-                continue
-            except:
-                logger.exception('unexpected exception reading pdf')
+            resp = check_resp(partial(requests.get, ref, headers=req_headers, timeout=3, stream=True),
+                              'ref {}'.format(ref))
+            if not resp:
+                local_bad_set.add(ref)
                 continue
 
-            if not (200 <= resp.status_code < 299):
-                logger.info('ref {} status {}').format(ref, resp.status_code)
-                continue
-
-            cnt_type = resp.headers.get('content-type', '').lower()
-            if cnt_type != 'x-pdf' and cnt_type != 'application/pdf':
-                logger.debug('ref {} ignored (not pdf)'.format(ref))
-                continue
-
-            logger.info('ref {} successfully downloaded'.format(ref))
+            logger.debug('ref {} successfully downloaded'.format(ref))
 
             logger.debug('parsing ref {}'.format(ref))
 
-            _rsrcmgr = PDFResourceManager()
             pdfstr = io.StringIO()
             _device = TextConverter(_rsrcmgr, pdfstr, codec='utf-8', laparams=LAParams())
             pdfinterp = PDFPageInterpreter(_rsrcmgr, _device)
@@ -120,6 +120,11 @@ def download_references(filename):
 
             logger.info('wrote text for {}'.format(ref))
 
+    with open('bad.txt', 'a') as f:
+        f.writelines('{}\n'.format(elem) for elem in local_bad_set.difference(bad_set))
+
+    bad_set.update(local_bad_set)
+
 
 def main():
     if not os.path.exists('refdata'):
@@ -133,16 +138,16 @@ def main():
     logging.getLogger('urllib3').setLevel(logging.INFO)
 
     root_logger = logging.getLogger()
-    root_logger.setLevel(logging.DEBUG)
+    root_logger.setLevel(logging.INFO)
 
     stream_handler = logging.StreamHandler()
     stream_handler.setFormatter(logging.Formatter('%(name)s - %(levelname)s - %(message)s'))
-    stream_handler.setLevel(logging.INFO)
+    # stream_handler.setLevel(logging.INFO)
 
     file_handler = logging.FileHandler('logs/dl.log', mode='w')
     file_handler.setFormatter(logging.Formatter('[%(asctime)s | %(name)s | %(levelname)s] %(message)s',
                                                 '%Y-%m-%d %H:%M:%S'))
-    file_handler.setLevel(logging.DEBUG)
+    # file_handler.setLevel(logging.INFO)
 
     root_logger.addHandler(stream_handler)
     root_logger.addHandler(file_handler)
author	Nathan Perry <avaglir@gmail.com>	2017-12-08 14:37:07 -0500
committer	Nathan Perry <avaglir@gmail.com>	2017-12-08 14:37:07 -0500
commit	fa549476445594e1feeb05b7782bb954e09f7580 (patch)
tree	de7e94ded6ce92011f7897d5ffcdab4324472ca2
parent	821f882320ad5df5ba10681b322fc7b3b8fd25ca (diff)