Improve metadata scraping.

This commit is contained in:
Jason Schwarzenberger 2020-12-04 12:46:46 +13:00
parent fdb4494cd8
commit da7f6330bf
8 changed files with 74 additions and 109 deletions

View File

@ -12,7 +12,8 @@ import settings
from feeds import hackernews, reddit, tildes, substack, manual
from feeds.sitemap import Sitemap
from feeds.category import Category
from scrapers import outline, declutter, headless, simple
from scrapers import outline
from scrapers.declutter import declutter, headless, simple
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
@ -145,11 +146,17 @@ def update_story(story, is_manual=False, urlref=None):
logging.info('Getting article ' + story['url'])
details, scraper = get_article(story['url'])
if not details: return False
story['text'] = details.get('content', '')
story['excerpt'] = details.get('excerpt', '')
story['scraper'] = scraper
story['scraper_link'] = details.get('scraper_link', '')
story['text'] = details.get('content', '')
if not story['text']: return False
story['excerpt'] = details.get('excerpt', '')
story['scraper_link'] = details.get('scraper_link', '')
meta = details.get('meta')
if meta:
og = meta.get('og')
story['image'] = meta.get('image', '')
if og:
story['image'] = og.get('og:image', meta.get('image', ''))
return True

View File

@ -6,7 +6,7 @@ logging.basicConfig(
import re
import requests
from bs4 import BeautifulSoup
from scrapers import declutter
from scrapers.declutter import declutter, headless
import extruct
import settings

View File

@ -4,38 +4,61 @@ logging.basicConfig(
level=logging.DEBUG)
import requests
DECLUTTER_API = 'https://declutter.1j.nz/headless/details'
DECLUTTER_COMMENT_API = 'https://declutter.1j.nz/headless/comments'
TIMEOUT = 90
from settings import HEADLESS_READER_PORT, SIMPLE_READER_PORT
class Simple:
def __init__(self, host, name, internal=True, timeout=20):
self.host = host
self.name = name
self.internal = internal
self.timeout = timeout
self.variant = 'simple'
def get_html(url):
logging.info(f"Declutter Scraper: {url}")
details = get_details(url)
def as_readable(self, details):
if not self.internal:
details['scraper_link'] = self.host
return details
def get_html(self, url):
details = self.get_details(url)
if not details:
return ''
return details['content']
def get_details(url):
def get_details(self, url):
logging.info(f"{self.name} Scraper: {url}")
details = self._json(f"{self.host}/{self.variant}/details", dict(url=url), "article")
if not details: return None
return self.as_readable(details)
def _json(self, url, data, adjective):
try:
r = requests.post(DECLUTTER_API, data=dict(url=url), timeout=TIMEOUT)
r = requests.post(url, data=data, timeout=self.timeout)
if r.status_code != 200:
raise Exception('Bad response code ' + str(r.status_code))
return r.json()
except KeyboardInterrupt:
raise
except BaseException as e:
logging.error('Problem decluttering article: {}'.format(str(e)))
logging.error('{}: Problem scraping {}: {}'.format(self.name, adjective, str(e)))
return None
def get_comments(url):
try:
r = requests.post(DECLUTTER_COMMENT_API, data=dict(url=url), timeout=TIMEOUT)
if r.status_code != 200:
raise Exception('Bad response code ' + str(r.status_code))
return r.json()
except KeyboardInterrupt:
raise
except BaseException as e:
logging.error('Problem getting comments for article: {}'.format(str(e)))
return None
class Headless(Simple):
    """Scraper client for a reader service's ``headless`` endpoints.

    Inherits article fetching from ``Simple``, targets the ``headless``
    variant of the service, defaults to a 90 second request timeout and
    adds comment fetching.
    """

    def __init__(self, host, name, internal=True, timeout=90):
        """Store the connection settings and select the ``headless`` variant.

        Args:
            host: Base URL of the reader service, without a trailing slash.
            name: Label used as the prefix of this scraper's log messages.
            internal: Stored as ``self.internal``; ``Simple.as_readable``
                adds a ``scraper_link`` to results when this is false.
            timeout: Timeout in seconds handed to each request.
        """
        # Delegate the shared attribute setup to Simple instead of repeating it.
        super().__init__(host, name, internal=internal, timeout=timeout)
        # Simple.__init__ sets variant to 'simple'; override it afterwards.
        self.variant = 'headless'

    def get_comments(self, url):
        """Fetch the comments for ``url`` from ``{host}/{variant}/comments``.

        Returns:
            The decoded JSON response, or ``None`` when the request failed
            or the response was empty.
        """
        logging.info(f"{self.name} Scraper: {url}")
        comments = self._json(f"{self.host}/{self.variant}/comments", dict(url=url), "comments")
        # Any falsy result (failed request, empty payload) is reported as None.
        return comments or None
# Remote declutter service. internal=False makes as_readable() record the host
# as the 'scraper_link' of every result.
declutter = Headless('https://declutter.1j.nz', 'Declutter scraper', internal=False)
# Headless reader on the loopback interface; the port falls back to 33843 when
# HEADLESS_READER_PORT is falsy.
headless = Headless(f"http://127.0.0.1:{HEADLESS_READER_PORT or 33843}", 'Headless scraper')
# Simple reader on the loopback interface; the port falls back to 33843 when
# SIMPLE_READER_PORT is falsy.
simple = Simple(f"http://127.0.0.1:{SIMPLE_READER_PORT or 33843}", 'Simple scraper')

View File

@ -1,41 +0,0 @@
import logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
import requests
from settings import HEADLESS_READER_PORT
# Details endpoint of the headless reader on the loopback interface; the port
# falls back to 33843 when HEADLESS_READER_PORT is falsy.
READ_API = 'http://127.0.0.1:{}/headless/details'.format(HEADLESS_READER_PORT or 33843)
# get_comments() reads READ_COMMENT_API (single underscore). The constant was
# previously spelled with a double underscore, which left this name undefined:
# every comments request raised NameError inside the try block and was logged
# and turned into None.
READ_COMMENT_API = 'http://127.0.0.1:{}/headless/comments'.format(HEADLESS_READER_PORT or 33843)
# Former misspelled name, kept as an alias for any code that referenced it.
READ_COMMENT__API = READ_COMMENT_API
# Timeout in seconds passed to every request made by this module.
TIMEOUT = 90
def get_html(url):
    """Return the scraped article content for ``url``, or ``''`` on failure.

    Logs the request, fetches the article details and hands back their
    ``'content'`` entry. An empty string is returned when no details came
    back.
    """
    logging.info(f"Headless Scraper: {url}")
    article = get_details(url)
    return article['content'] if article else ''
def get_details(url):
    """Request the article details for ``url`` from the headless reader.

    Posts ``url`` as form data to ``READ_API`` and decodes the JSON response.

    Returns:
        The decoded JSON body, or ``None`` when the request fails or the
        response status is not 200. Failures are logged, not raised.
    """
    try:
        r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
        if r.status_code != 200:
            # Raised so the bad status is reported through the handler below.
            raise Exception('Bad response code ' + str(r.status_code))
        return r.json()
    except Exception as e:
        # Catching Exception rather than BaseException lets KeyboardInterrupt,
        # SystemExit and GeneratorExit propagate instead of being swallowed.
        logging.error('Problem scraping article: {}'.format(str(e)))
        return None
def get_comments(url):
    """Request the comments for ``url`` from the headless reader.

    Posts ``url`` as form data to ``READ_COMMENT_API`` and decodes the JSON
    response.

    Returns:
        The decoded JSON body, or ``None`` when the request fails or the
        response status is not 200. Failures are logged, not raised.
    """
    try:
        r = requests.post(READ_COMMENT_API, data=dict(url=url), timeout=TIMEOUT)
        if r.status_code != 200:
            # Raised so the bad status is reported through the handler below.
            raise Exception('Bad response code ' + str(r.status_code))
        return r.json()
    except Exception as e:
        # Catching Exception rather than BaseException lets KeyboardInterrupt,
        # SystemExit and GeneratorExit propagate instead of being swallowed.
        logging.error('Problem getting comments for article: {}'.format(str(e)))
        return None

View File

@ -29,8 +29,10 @@ def as_readable(details):
'siteName': details['site_name'],
'url': details['article_url'],
'publisher': details['site_name'],
'scraper_link': 'https://outline.com/' + details['short_code']
'scraper_link': 'https://outline.com/' + details['short_code'],
'meta': {}
}
readable['meta'].update(details['meta'])
return readable
def _get_outline(url):
@ -40,8 +42,7 @@ def _get_outline(url):
headers = {'Referer': OUTLINE_REFERER}
r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=TIMEOUT)
if r.status_code == 429:
logging.info('Rate limited by outline, sleeping 30s and skipping...')
time.sleep(30)
logging.info('Rate limited by outline, skipping...')
return None
if r.status_code != 200:
raise Exception('Bad response code ' + str(r.status_code))

View File

@ -1,28 +0,0 @@
import logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
import requests
from settings import SIMPLE_READER_PORT
# Details endpoint of the simple reader on the loopback interface; the port
# falls back to 33843 when SIMPLE_READER_PORT is falsy.
READ_API = 'http://127.0.0.1:{}/simple/details'.format(SIMPLE_READER_PORT or 33843)
# Timeout in seconds passed to requests.post() by get_details().
TIMEOUT = 20
def get_html(url):
    """Return the scraped article content for ``url``, or ``''`` on failure.

    Logs the request, fetches the article details and hands back their
    ``'content'`` entry. An empty string is returned when no details came
    back.
    """
    logging.info(f"Simple Scraper: {url}")
    article = get_details(url)
    return article['content'] if article else ''
def get_details(url):
    """Request the article details for ``url`` from the simple reader.

    Posts ``url`` as form data to ``READ_API`` and decodes the JSON response.

    Returns:
        The decoded JSON body, or ``None`` when the request fails or the
        response status is not 200. Failures are logged, not raised.
    """
    try:
        r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
        if r.status_code != 200:
            # Raised so the bad status is reported through the handler below.
            raise Exception('Bad response code ' + str(r.status_code))
        return r.json()
    except Exception as e:
        # Catching Exception rather than BaseException lets KeyboardInterrupt,
        # SystemExit and GeneratorExit propagate instead of being swallowed.
        logging.error('Problem getting article: {}'.format(str(e)))
        return None

@ -1 +1 @@
Subproject commit 1a81bc139f6e5f2fcb021ff47921e9c47eb3f6da
Subproject commit 006be62214d89f33e1e99cd0d6af4e4e5e53e3b2

View File

@ -43,6 +43,9 @@
content={fromUnixTime(story.date).toISOString()} />
<meta property="article:author" content={story.author || story.source} />
<meta property="og:description" content={story.excerpt || story.title} />
{#if story.image}
<meta property="og:image" content={story.image} />
{/if}
</svelte:head>
<section class="single">