improve meta data scraping.

Jason Schwarzenberger 2020-12-04 12:46:46 +13:00
parent fdb4494cd8
commit da7f6330bf
8 changed files with 74 additions and 109 deletions


@@ -12,7 +12,8 @@ import settings
 from feeds import hackernews, reddit, tildes, substack, manual
 from feeds.sitemap import Sitemap
 from feeds.category import Category
-from scrapers import outline, declutter, headless, simple
+from scrapers import outline
+from scrapers.declutter import declutter, headless, simple
 
 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
@@ -145,11 +146,17 @@ def update_story(story, is_manual=False, urlref=None):
     logging.info('Getting article ' + story['url'])
     details, scraper = get_article(story['url'])
     if not details: return False
-    story['text'] = details.get('content', '')
-    story['excerpt'] = details.get('excerpt', '')
     story['scraper'] = scraper
-    story['scraper_link'] = details.get('scraper_link', '')
+    story['text'] = details.get('content', '')
     if not story['text']: return False
+    story['excerpt'] = details.get('excerpt', '')
+    story['scraper_link'] = details.get('scraper_link', '')
+    meta = details.get('meta')
+    if meta:
+        og = meta.get('og')
+        story['image'] = meta.get('image', '')
+        if og:
+            story['image'] = og.get('og:image', meta.get('image', ''))
 
     return True
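The new image selection prefers the OpenGraph image and falls back to the scraper's generic one. A minimal sketch of that fallback order, run against a hypothetical details payload shaped like the meta dict this hunk reads (field values are illustrative, not real scraper output):

details = {
    'content': '<p>article body</p>',
    'meta': {
        'image': 'https://example.com/fallback.jpg',
        'og': {'og:image': 'https://example.com/social.jpg'},
    },
}

meta = details.get('meta')
image = ''
if meta:
    og = meta.get('og')
    image = meta.get('image', '')
    if og:
        # og:image wins when present; otherwise stay on the generic image
        image = og.get('og:image', meta.get('image', ''))

assert image == 'https://example.com/social.jpg'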


@@ -6,7 +6,7 @@ logging.basicConfig(
 import re
 import requests
 from bs4 import BeautifulSoup
-from scrapers import declutter
+from scrapers.declutter import declutter, headless
 import extruct
 
 import settings
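This file imports extruct alongside BeautifulSoup; the call site is outside the hunk, but a typical use of extruct for building the OpenGraph portion of such a meta dict would look roughly like the sketch below (the helper name and exact dict shape are assumptions, not code from this commit):

import extruct

def opengraph_meta(html, url):
    # extruct returns {'opengraph': [{'properties': [(key, value), ...], ...}]}
    data = extruct.extract(html, base_url=url, syntaxes=['opengraph'])
    og = {}
    for item in data.get('opengraph', []):
        og.update(dict(item.get('properties', [])))
    return {'og': og, 'image': og.get('og:image', '')}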


@@ -4,38 +4,61 @@ logging.basicConfig(
     level=logging.DEBUG)
 import requests
 
-DECLUTTER_API = 'https://declutter.1j.nz/headless/details'
-DECLUTTER_COMMENT_API = 'https://declutter.1j.nz/headless/comments'
-TIMEOUT = 90
-
-def get_html(url):
-    logging.info(f"Declutter Scraper: {url}")
-    details = get_details(url)
-    if not details:
-        return ''
-    return details['content']
-
-def get_details(url):
-    try:
-        r = requests.post(DECLUTTER_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem decluttering article: {}'.format(str(e)))
-        return None
-
-def get_comments(url):
-    try:
-        r = requests.post(DECLUTTER_COMMENT_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem getting comments for article: {}'.format(str(e)))
-        return None
+from settings import HEADLESS_READER_PORT, SIMPLE_READER_PORT
+
+class Simple:
+    def __init__(self, host, name, internal=True, timeout=20):
+        self.host = host
+        self.name = name
+        self.internal = internal
+        self.timeout = timeout
+        self.variant = 'simple'
+
+    def as_readable(self, details):
+        if not self.internal:
+            details['scraper_link'] = self.host
+        return details
+
+    def get_html(self, url):
+        details = self.get_details(url)
+        if not details:
+            return ''
+        return details['content']
+
+    def get_details(self, url):
+        logging.info(f"{self.name} Scraper: {url}")
+        details = self._json(f"{self.host}/{self.variant}/details", dict(url=url), "article")
+        if not details: return None
+        return self.as_readable(details)
+
+    def _json(self, url, data, adjective):
+        try:
+            r = requests.post(url, data=data, timeout=self.timeout)
+            if r.status_code != 200:
+                raise Exception('Bad response code ' + str(r.status_code))
+            return r.json()
+        except KeyboardInterrupt:
+            raise
+        except BaseException as e:
+            logging.error('{}: Problem scraping {}: {}'.format(self.name, adjective, str(e)))
+            return None
+
+class Headless(Simple):
+    def __init__(self, host, name, internal=True, timeout=90):
+        self.host = host
+        self.name = name
+        self.internal = internal
+        self.timeout = timeout
+        self.variant = 'headless'
+
+    def get_comments(self, url):
+        logging.info(f"{self.name} Scraper: {url}")
+        comments = self._json(f"{self.host}/{self.variant}/comments", dict(url=url), "comments")
+        if not comments: return None
+        return comments
+
+declutter = Headless('https://declutter.1j.nz', 'Declutter scraper', internal=False)
+headless = Headless(f"http://127.0.0.1:{HEADLESS_READER_PORT or 33843}", 'Headless scraper')
+simple = Simple(f"http://127.0.0.1:{SIMPLE_READER_PORT or 33843}", 'Simple scraper')
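The module now exposes three preconfigured scraper objects instead of per-module functions, so all three share one implementation. Usage, based on the methods defined above (the URL is illustrative):

from scrapers.declutter import declutter, headless, simple

url = 'https://example.com/story'
details = declutter.get_details(url)   # remote declutter.1j.nz service; sets scraper_link
html = simple.get_html(url)            # local simple reader, 20s default timeout
comments = headless.get_comments(url)  # comments exist only on Headless instances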


@@ -1,41 +0,0 @@
-import logging
-logging.basicConfig(
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    level=logging.DEBUG)
-import requests
-
-from settings import HEADLESS_READER_PORT
-
-READ_API = 'http://127.0.0.1:{}/headless/details'.format(HEADLESS_READER_PORT or 33843)
-READ_COMMENT__API = 'http://127.0.0.1:{}/headless/comments'.format(HEADLESS_READER_PORT or 33843)
-TIMEOUT = 90
-
-def get_html(url):
-    logging.info(f"Headless Scraper: {url}")
-    details = get_details(url)
-    if not details:
-        return ''
-    return details['content']
-
-def get_details(url):
-    try:
-        r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem scraping article: {}'.format(str(e)))
-        return None
-
-def get_comments(url):
-    try:
-        r = requests.post(READ_COMMENT_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem getting comments for article: {}'.format(str(e)))
-        return None


@@ -29,8 +29,10 @@ def as_readable(details):
         'siteName': details['site_name'],
         'url': details['article_url'],
         'publisher': details['site_name'],
-        'scraper_link': 'https://outline.com/' + details['short_code']
+        'scraper_link': 'https://outline.com/' + details['short_code'],
+        'meta': {}
     }
+    readable['meta'].update(details['meta'])
     return readable
 
 def _get_outline(url):
@@ -40,8 +42,7 @@ def _get_outline(url):
     headers = {'Referer': OUTLINE_REFERER}
     r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=TIMEOUT)
     if r.status_code == 429:
-        logging.info('Rate limited by outline, sleeping 30s and skipping...')
-        time.sleep(30)
+        logging.info('Rate limited by outline, skipping...')
         return None
     if r.status_code != 200:
         raise Exception('Bad response code ' + str(r.status_code))
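With the 'meta' key added, as_readable in the first hunk above now forwards Outline's metadata to update_story. A sketch of the merge, assuming Outline's response carries a meta dict (field values are illustrative):

details = {
    'site_name': 'Example News',
    'article_url': 'https://example.com/a',
    'short_code': 'abc123',
    'meta': {'og': {'og:image': 'https://example.com/social.jpg'}},
}
readable = {
    'siteName': details['site_name'],
    'url': details['article_url'],
    'publisher': details['site_name'],
    'scraper_link': 'https://outline.com/' + details['short_code'],
    'meta': {},
}
readable['meta'].update(details['meta'])
# readable['meta']['og'] now feeds the og:image selection in update_story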


@@ -1,28 +0,0 @@
-import logging
-logging.basicConfig(
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    level=logging.DEBUG)
-import requests
-
-from settings import SIMPLE_READER_PORT
-
-READ_API = 'http://127.0.0.1:{}/simple/details'.format(SIMPLE_READER_PORT or 33843)
-TIMEOUT = 20
-
-def get_html(url):
-    logging.info(f"Simple Scraper: {url}")
-    details = get_details(url)
-    if not details:
-        return ''
-    return details['content']
-
-def get_details(url):
-    try:
-        r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem getting article: {}'.format(str(e)))
-        return None

@@ -1 +1 @@
-Subproject commit 1a81bc139f6e5f2fcb021ff47921e9c47eb3f6da
+Subproject commit 006be62214d89f33e1e99cd0d6af4e4e5e53e3b2


@@ -43,6 +43,9 @@
         content={fromUnixTime(story.date).toISOString()} />
     <meta property="article:author" content={story.author || story.source} />
     <meta property="og:description" content={story.excerpt || story.title} />
+    {#if story.image}
+        <meta property="og:image" content={story.image} />
+    {/if}
 </svelte:head>
 
 <section class="single">