From 850b30e353e634cb2f7eb7113414197966770a9d Mon Sep 17 00:00:00 2001
From: Tanner Collin
Date: Sat, 4 Jul 2020 00:25:41 +0000
Subject: [PATCH] Add requests timeouts and temporary logging

---
 apiserver/feed.py         | 25 ++++++++++++++++++++-----
 apiserver/feeds/tildes.py |  8 +++++++-
 apiserver/server.py       |  2 +-
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/apiserver/feed.py b/apiserver/feed.py
index 33d15b0..8682309 100644
--- a/apiserver/feed.py
+++ b/apiserver/feed.py
@@ -63,7 +63,7 @@ def get_first_image(text):
         first_img = soup.find('img')
         url = first_img['src']
         headers = {'User-Agent': 'Twitterbot/1.0'}
-        length = requests.get(url, headers=headers).headers['content-length']
+        length = requests.get(url, headers=headers, timeout=4).headers['content-length']
         if int(length) > 1000000: raise
         return url
     except:
@@ -72,9 +72,15 @@ def get_first_image(text):
 def get_content_type(url):
     try:
         headers = {'User-Agent': 'Twitterbot/1.0'}
-        return requests.get(url, headers=headers).headers['content-type']
+        return requests.get(url, headers=headers, timeout=2).headers['content-type']
     except:
-        return ''
+        pass
+
+    try:
+        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'}
+        return requests.get(url, headers=headers, timeout=2).headers['content-type']
+    except:
+        return 'text/'
 
 def update_story(story, is_manual=False):
     res = {}
@@ -90,26 +96,35 @@ def update_story(story, is_manual=False):
     elif story['source'] == 'manual':
         res = manual.story(story['ref'])
 
+    logging.info('Got story')
+
     if res:
         story.update(res) # join dicts
     else:
-        logging.info('Article not ready yet')
+        logging.info('Story not ready yet')
         return False
 
+    logging.info('story joined')
+
     if story['date'] and not is_manual and story['date'] + TWO_DAYS < time.time():
-        logging.info('Article too old, removing')
+        logging.info('Story too old, removing')
         return False
 
+    logging.info('story age good')
+
     if story.get('url', '') and not story.get('text', ''):
+        logging.info('inside if')
         if not get_content_type(story['url']).startswith('text/'):
             logging.info('URL invalid file type / content type:')
             logging.info(story['url'])
             return False
 
+        logging.info('content type good')
         if any([domain in story['url'] for domain in INVALID_DOMAINS]):
             logging.info('URL invalid domain:')
             logging.info(story['url'])
             return False
 
+        logging.info('domain good')
         logging.info('Getting article ' + story['url'])
         story['text'] = get_article(story['url'])
 
diff --git a/apiserver/feeds/tildes.py b/apiserver/feeds/tildes.py
index 83a2e86..920c751 100644
--- a/apiserver/feeds/tildes.py
+++ b/apiserver/feeds/tildes.py
@@ -73,10 +73,14 @@ def story(ref):
     html = api(API_ITEM(ref))
     if not html: return False
 
+    logging.info('Got Tildes html')
+
     soup = BeautifulSoup(html, features='html.parser')
     a = soup.find('article', class_='topic-full')
     if a is None: return False
 
+    logging.info('Got article tag')
+
     h = a.find('header')
     lu = h.find('a', class_='link-user')
 
@@ -102,6 +106,8 @@ def story(ref):
     ch = a.find('header', class_='topic-comments-header')
     s['num_comments'] = int(ch.h2.string.split(' ')[0]) if ch else 0
 
+    logging.info('Parsed html data')
+
     if s['score'] < 8 and s['num_comments'] < 6:
         return False
 
@@ -122,7 +128,7 @@ if __name__ == '__main__':
     #print(self_post)
     #li_comment = story('gqx')
     #print(li_comment)
-    broken = story('n03')
+    broken = story('q4y')
     print(broken)
 
     # make sure there's no self-reference
diff --git a/apiserver/server.py b/apiserver/server.py
index 659ced9..f7716d4 100644
--- a/apiserver/server.py
+++ b/apiserver/server.py
@@ -184,7 +184,7 @@ def feed_thread():
                 database.del_ref(item['ref'])
                 logging.info('Removed ref {}'.format(item['ref']))
 
-            gevent.sleep(6)
+            gevent.sleep(60)
             news_index += 1
             if news_index == FEED_LENGTH: news_index = 0
 
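As an illustrative sketch of the pattern the feed.py hunk introduces in get_content_type: each request gets a short timeout, a browser User-Agent is retried when the Twitterbot one fails, and 'text/' is returned as a last resort so the startswith('text/') check in update_story still passes. Only the User-Agent strings, the 2-second timeout, and the fallback value come from the diff; the function name probe_content_type, the loop, and the exception handling are assumptions made for the sketch, not code from the repository.

    import requests

    UA_TWITTERBOT = 'Twitterbot/1.0'
    UA_FIREFOX = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) '
                  'Gecko/20100101 Firefox/77.0')

    def probe_content_type(url, timeout=2):
        # Try each User-Agent in turn; a timeout, connection error, or missing
        # header moves on to the next one instead of hanging the feed thread.
        for user_agent in (UA_TWITTERBOT, UA_FIREFOX):
            try:
                response = requests.get(url, headers={'User-Agent': user_agent},
                                        timeout=timeout)
                return response.headers['content-type']
            except (requests.RequestException, KeyError):
                continue
        # Fall back to 'text/' so a startswith('text/') check still passes,
        # matching the behaviour of the patched get_content_type.
        return 'text/'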