Improve metadata scraping.

This commit is contained in:
parent fdb4494cd8
commit da7f6330bf
@@ -12,7 +12,8 @@ import settings
 from feeds import hackernews, reddit, tildes, substack, manual
 from feeds.sitemap import Sitemap
 from feeds.category import Category
-from scrapers import outline, declutter, headless, simple
+from scrapers import outline
+from scrapers.declutter import declutter, headless, simple
 
 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
 
@@ -145,11 +146,17 @@ def update_story(story, is_manual=False, urlref=None):
     logging.info('Getting article ' + story['url'])
     details, scraper = get_article(story['url'])
     if not details: return False
-    story['text'] = details.get('content', '')
-    story['excerpt'] = details.get('excerpt', '')
     story['scraper'] = scraper
-    story['scraper_link'] = details.get('scraper_link', '')
+    story['text'] = details.get('content', '')
     if not story['text']: return False
+    story['excerpt'] = details.get('excerpt', '')
+    story['scraper_link'] = details.get('scraper_link', '')
+    meta = details.get('meta')
+    if meta:
+        og = meta.get('og')
+        story['image'] = meta.get('image', '')
+        if og:
+            story['image'] = og.get('og:image', meta.get('image', ''))
 
     return True
 
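Note: the new image handling prefers the Open Graph og:image and falls back to the page's generic meta image. A minimal standalone sketch of that fallback order, with a hypothetical details payload:

    # Sketch of the fallback above; the details dict is hypothetical.
    details = {'meta': {'image': 'https://example.com/generic.png',
                        'og': {'og:image': 'https://example.com/og.png'}}}

    meta = details.get('meta')
    image = ''
    if meta:
        og = meta.get('og')
        image = meta.get('image', '')  # generic <meta> image as the default
        if og:
            # og:image wins when present; otherwise keep the generic image
            image = og.get('og:image', meta.get('image', ''))

    print(image)  # -> https://example.com/og.png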
@@ -6,7 +6,7 @@ logging.basicConfig(
 import re
 import requests
 from bs4 import BeautifulSoup
-from scrapers import declutter
+from scrapers.declutter import declutter, headless
 import extruct
 
 import settings
@@ -4,38 +4,61 @@ logging.basicConfig(
     level=logging.DEBUG)
 import requests
 
-DECLUTTER_API = 'https://declutter.1j.nz/headless/details'
-DECLUTTER_COMMENT_API = 'https://declutter.1j.nz/headless/comments'
-TIMEOUT = 90
+from settings import HEADLESS_READER_PORT, SIMPLE_READER_PORT
 
-def get_html(url):
-    logging.info(f"Declutter Scraper: {url}")
-    details = get_details(url)
-    if not details:
-        return ''
-    return details['content']
+class Simple:
+    def __init__(self, host, name, internal=True, timeout=20):
+        self.host = host
+        self.name = name
+        self.internal = internal
+        self.timeout = timeout
+        self.variant = 'simple'
 
-def get_details(url):
-    try:
-        r = requests.post(DECLUTTER_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem decluttering article: {}'.format(str(e)))
-        return None
+    def as_readable(self, details):
+        if not self.internal:
+            details['scraper_link'] = self.host
+        return details
 
+    def get_html(self, url):
+        details = self.get_details(url)
+        if not details:
+            return ''
+        return details['content']
 
-def get_comments(url):
-    try:
-        r = requests.post(DECLUTTER_COMMENT_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem getting comments for article: {}'.format(str(e)))
-        return None
+    def get_details(self, url):
+        logging.info(f"{self.name} Scraper: {url}")
+        details = self._json(f"{self.host}/{self.variant}/details", dict(url=url), "article")
+        if not details: return None
+        return self.as_readable(details)
+
+    def _json(self, url, data, adjective):
+        try:
+            r = requests.post(url, data=data, timeout=self.timeout)
+            if r.status_code != 200:
+                raise Exception('Bad response code ' + str(r.status_code))
+            return r.json()
+        except KeyboardInterrupt:
+            raise
+        except BaseException as e:
+            logging.error('{}: Problem scraping {}: {}'.format(self.name, adjective, str(e)))
+            return None
+
+
+class Headless(Simple):
+    def __init__(self, host, name, internal=True, timeout=90):
+        self.host = host
+        self.name = name
+        self.internal = internal
+        self.timeout = timeout
+        self.variant = 'headless'
+
+    def get_comments(self, url):
+        logging.info(f"{self.name} Scraper: {url}")
+        comments = self._json(f"{self.host}/{self.variant}/comments", dict(url=url), "comments")
+        if not comments: return None
+        return comments
+
+
+declutter = Headless('https://declutter.1j.nz', 'Declutter scraper', internal=False)
+headless = Headless(f"http://127.0.0.1:{HEADLESS_READER_PORT or 33843}", 'Headless scraper')
+simple = Simple(f"http://127.0.0.1:{SIMPLE_READER_PORT or 33843}", 'Simple scraper')
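Note: declutter, headless, and simple are now shared instances rather than modules, which is why the import in the first hunk changes. A short usage sketch of the refactored objects as constructed above (the article URL is hypothetical):

    from scrapers.declutter import declutter, headless, simple

    url = 'https://example.com/article'  # hypothetical

    details = declutter.get_details(url)   # external host, so 'scraper_link' gets set
    html = simple.get_html(url)            # local simple reader, 20s default timeout
    comments = headless.get_comments(url)  # only Headless exposes comments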
@@ -1,41 +0,0 @@
-import logging
-logging.basicConfig(
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    level=logging.DEBUG)
-import requests
-from settings import HEADLESS_READER_PORT
-
-READ_API = 'http://127.0.0.1:{}/headless/details'.format(HEADLESS_READER_PORT or 33843)
-READ_COMMENT__API = 'http://127.0.0.1:{}/headless/comments'.format(HEADLESS_READER_PORT or 33843)
-TIMEOUT = 90
-
-def get_html(url):
-    logging.info(f"Headless Scraper: {url}")
-    details = get_details(url)
-    if not details:
-        return ''
-    return details['content']
-
-def get_details(url):
-    try:
-        r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem scraping article: {}'.format(str(e)))
-        return None
-
-def get_comments(url):
-    try:
-        r = requests.post(READ_COMMENT_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem getting comments for article: {}'.format(str(e)))
-        return None
@@ -29,8 +29,10 @@ def as_readable(details):
         'siteName': details['site_name'],
         'url': details['article_url'],
         'publisher': details['site_name'],
-        'scraper_link': 'https://outline.com/' + details['short_code']
+        'scraper_link': 'https://outline.com/' + details['short_code'],
+        'meta': {}
     }
+    readable['meta'].update(details['meta'])
     return readable
 
 def _get_outline(url):
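Note: passing the upstream meta dict through as_readable is what lets update_story pick an og:image for outline-scraped stories. A hypothetical sketch of the readable payload shape after this change (all field values are made up):

    readable = {
        'content': '<p>article body</p>',
        'siteName': 'Example',
        'url': 'https://example.com/story',
        'publisher': 'Example',
        'scraper_link': 'https://outline.com/abc123',
        'meta': {'og': {'og:image': 'https://example.com/og.png'}},
    }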
@@ -40,8 +42,7 @@ def _get_outline(url):
     headers = {'Referer': OUTLINE_REFERER}
     r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=TIMEOUT)
     if r.status_code == 429:
-        logging.info('Rate limited by outline, sleeping 30s and skipping...')
-        time.sleep(30)
+        logging.info('Rate limited by outline, skipping...')
         return None
     if r.status_code != 200:
         raise Exception('Bad response code ' + str(r.status_code))
@@ -1,28 +0,0 @@
-import logging
-logging.basicConfig(
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    level=logging.DEBUG)
-import requests
-from settings import SIMPLE_READER_PORT
-
-READ_API = 'http://127.0.0.1:{}/simple/details'.format(SIMPLE_READER_PORT or 33843)
-TIMEOUT = 20
-
-def get_html(url):
-    logging.info(f"Simple Scraper: {url}")
-    details = get_details(url)
-    if not details:
-        return ''
-    return details['content']
-
-def get_details(url):
-    try:
-        r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem getting article: {}'.format(str(e)))
-        return None
@@ -1 +1 @@
-Subproject commit 1a81bc139f6e5f2fcb021ff47921e9c47eb3f6da
+Subproject commit 006be62214d89f33e1e99cd0d6af4e4e5e53e3b2
@@ -43,6 +43,9 @@
     content={fromUnixTime(story.date).toISOString()} />
   <meta property="article:author" content={story.author || story.source} />
   <meta property="og:description" content={story.excerpt || story.title} />
+  {#if story.image}
+    <meta property="og:image" content={story.image} />
+  {/if}
 </svelte:head>
 
 <section class="single">