improve metadata scraping.

Jason Schwarzenberger
2020-12-04 12:46:46 +13:00
parent fdb4494cd8
commit da7f6330bf
8 changed files with 74 additions and 109 deletions

View File

@@ -4,38 +4,61 @@ logging.basicConfig(
     level=logging.DEBUG)
 import requests
-DECLUTTER_API = 'https://declutter.1j.nz/headless/details'
-DECLUTTER_COMMENT_API = 'https://declutter.1j.nz/headless/comments'
-TIMEOUT = 90
+from settings import HEADLESS_READER_PORT, SIMPLE_READER_PORT
+
+class Simple:
+    def __init__(self, host, name, internal=True, timeout=20):
+        self.host = host
+        self.name = name
+        self.internal = internal
+        self.timeout = timeout
+        self.variant = 'simple'
+
+    def as_readable(self, details):
+        if not self.internal:
+            details['scraper_link'] = self.host
+        return details
+
+    def get_html(self, url):
+        details = self.get_details(url)
+        if not details:
+            return ''
+        return details['content']
+
+    def get_details(self, url):
+        logging.info(f"{self.name} Scraper: {url}")
+        details = self._json(f"{self.host}/{self.variant}/details", dict(url=url), "article")
+        if not details: return None
+        return self.as_readable(details)
-def get_html(url):
-    logging.info(f"Declutter Scraper: {url}")
-    details = get_details(url)
-    if not details:
-        return ''
-    return details['content']
+
+    def _json(self, url, data, adjective):
+        try:
+            r = requests.post(url, data=data, timeout=self.timeout)
+            if r.status_code != 200:
+                raise Exception('Bad response code ' + str(r.status_code))
+            return r.json()
+        except KeyboardInterrupt:
+            raise
+        except BaseException as e:
+            logging.error('{}: Problem scraping {}: {}'.format(self.name, adjective, str(e)))
+            return None
-def get_details(url):
-    try:
-        r = requests.post(DECLUTTER_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem decluttering article: {}'.format(str(e)))
-        return None
-
-def get_comments(url):
-    try:
-        r = requests.post(DECLUTTER_COMMENT_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem getting comments for article: {}'.format(str(e)))
-        return None
+
+class Headless(Simple):
+    def __init__(self, host, name, internal=True, timeout=90):
+        self.host = host
+        self.name = name
+        self.internal = internal
+        self.timeout = timeout
+        self.variant = 'headless'
+
+    def get_comments(self, url):
+        logging.info(f"{self.name} Scraper: {url}")
+        comments = self._json(f"{self.host}/{self.variant}/comments", dict(url=url), "comments")
+        if not comments: return None
+        return comments
+
+declutter = Headless('https://declutter.1j.nz', 'Declutter scraper', internal=False)
+headless = Headless(f"http://127.0.0.1:{HEADLESS_READER_PORT or 33843}", 'Headless scraper')
+simple = Simple(f"http://127.0.0.1:{SIMPLE_READER_PORT or 33843}", 'Simple scraper')
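
For reference, a minimal sketch of how calling code might use the new module-level scraper instances; the module path, example URL, and printout are illustrative assumptions, not part of this commit:

# Hypothetical usage of the instances defined above
# (module path assumed; adjust to wherever this file lives).
from scrapers import declutter, headless, simple

url = 'https://example.com/article'  # illustrative URL

details = declutter.get_details(url)  # readable details dict, or None on failure
if details:
    # declutter is constructed with internal=False, so as_readable()
    # attaches the remote host as a scraper_link:
    print(details['scraper_link'])    # 'https://declutter.1j.nz'

html = simple.get_html(url)            # article content, or '' on failure
comments = headless.get_comments(url)  # comments JSON (Headless only), or None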

View File

@@ -1,41 +0,0 @@
-import logging
-logging.basicConfig(
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    level=logging.DEBUG)
-import requests
-from settings import HEADLESS_READER_PORT
-
-READ_API = 'http://127.0.0.1:{}/headless/details'.format(HEADLESS_READER_PORT or 33843)
-READ_COMMENT_API = 'http://127.0.0.1:{}/headless/comments'.format(HEADLESS_READER_PORT or 33843)
-TIMEOUT = 90
-
-def get_html(url):
-    logging.info(f"Headless Scraper: {url}")
-    details = get_details(url)
-    if not details:
-        return ''
-    return details['content']
-
-def get_details(url):
-    try:
-        r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem scraping article: {}'.format(str(e)))
-        return None
-
-def get_comments(url):
-    try:
-        r = requests.post(READ_COMMENT_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem getting comments for article: {}'.format(str(e)))
-        return None

View File

@@ -29,8 +29,10 @@ def as_readable(details):
         'siteName': details['site_name'],
         'url': details['article_url'],
         'publisher': details['site_name'],
-        'scraper_link': 'https://outline.com/' + details['short_code']
+        'scraper_link': 'https://outline.com/' + details['short_code'],
+        'meta': {}
     }
+    readable['meta'].update(details['meta'])
     return readable
 
 def _get_outline(url):
@@ -40,8 +42,7 @@ def _get_outline(url):
         headers = {'Referer': OUTLINE_REFERER}
         r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=TIMEOUT)
         if r.status_code == 429:
-            logging.info('Rate limited by outline, sleeping 30s and skipping...')
-            time.sleep(30)
+            logging.info('Rate limited by outline, skipping...')
             return None
         if r.status_code != 200:
             raise Exception('Bad response code ' + str(r.status_code))
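
The net effect of the as_readable() change above: metadata returned by the Outline API now survives into the readable dict under the new 'meta' key. A small worked example, using a made-up details payload shaped like the fields as_readable() reads:

# Made-up payload for illustration only.
details = {
    'site_name': 'Example News',
    'article_url': 'https://example.com/story',
    'short_code': 'abc123',
    'meta': {'description': 'An example story.'},
}

readable = {
    'siteName': details['site_name'],
    'url': details['article_url'],
    'publisher': details['site_name'],
    'scraper_link': 'https://outline.com/' + details['short_code'],
    'meta': {}
}
readable['meta'].update(details['meta'])
# readable['meta'] == {'description': 'An example story.'}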

View File

@@ -1,28 +0,0 @@
-import logging
-logging.basicConfig(
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    level=logging.DEBUG)
-import requests
-from settings import SIMPLE_READER_PORT
-
-READ_API = 'http://127.0.0.1:{}/simple/details'.format(SIMPLE_READER_PORT or 33843)
-TIMEOUT = 20
-
-def get_html(url):
-    logging.info(f"Simple Scraper: {url}")
-    details = get_details(url)
-    if not details:
-        return ''
-    return details['content']
-
-def get_details(url):
-    try:
-        r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem getting article: {}'.format(str(e)))
-        return None
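
Both deleted modules are subsumed by the class-based scrapers in the first file: the per-service get_html()/get_details()/get_comments() functions become methods on the headless and simple instances. A sketch of the equivalent calls, with module paths assumed for illustration:

# Before (deleted modules, paths assumed):
#   from scraper import headless, simple
#   html = simple.get_html(url)
#   comments = headless.get_comments(url)

# After (module path assumed):
from scrapers import headless, simple

url = 'https://example.com/article'  # illustrative URL
html = simple.get_html(url)            # POSTs to the local simple reader service
comments = headless.get_comments(url)  # POSTs to the local headless reader service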