forked from tanner/qotnews
improve metadata scraping.
@@ -12,7 +12,8 @@ import settings
 from feeds import hackernews, reddit, tildes, substack, manual
 from feeds.sitemap import Sitemap
 from feeds.category import Category
-from scrapers import outline, declutter, headless, simple
+from scrapers import outline
+from scrapers.declutter import declutter, headless, simple
 
 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
 
@@ -145,11 +146,17 @@ def update_story(story, is_manual=False, urlref=None):
         logging.info('Getting article ' + story['url'])
         details, scraper = get_article(story['url'])
         if not details: return False
-        story['text'] = details.get('content', '')
-        story['excerpt'] = details.get('excerpt', '')
         story['scraper'] = scraper
-        story['scraper_link'] = details.get('scraper_link', '')
+        story['text'] = details.get('content', '')
+        if not story['text']: return False
+        story['excerpt'] = details.get('excerpt', '')
+        story['scraper_link'] = details.get('scraper_link', '')
+        meta = details.get('meta')
+        if meta:
+            og = meta.get('og')
+            story['image'] = meta.get('image', '')
+            if og:
+                story['image'] = og.get('og:image', meta.get('image', ''))
 
     return True
 
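A minimal sketch of the og:image fallback this hunk adds, using a hypothetical `details` payload (in the real code it comes from get_article):

    # Hypothetical payload; the keys mirror the diff above.
    details = {'meta': {
        'image': 'https://example.com/fallback.png',
        'og': {'og:image': 'https://example.com/og.png'},
    }}

    story = {}
    meta = details.get('meta')
    if meta:
        og = meta.get('og')
        story['image'] = meta.get('image', '')  # generic <meta> image first
        if og:
            # prefer the Open Graph image, else keep the generic fallback
            story['image'] = og.get('og:image', meta.get('image', ''))

    print(story['image'])  # https://example.com/og.png
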
@@ -6,7 +6,7 @@ logging.basicConfig(
 import re
 import requests
 from bs4 import BeautifulSoup
-from scrapers import declutter
+from scrapers.declutter import declutter, headless
 import extruct
 
 import settings
 
@@ -4,38 +4,61 @@ logging.basicConfig(
         level=logging.DEBUG)
 import requests
 
-DECLUTTER_API = 'https://declutter.1j.nz/headless/details'
-DECLUTTER_COMMENT_API = 'https://declutter.1j.nz/headless/comments'
-TIMEOUT = 90
-
-def get_html(url):
-    logging.info(f"Declutter Scraper: {url}")
-    details = get_details(url)
-    if not details:
-        return ''
-    return details['content']
-
-def get_details(url):
-    try:
-        r = requests.post(DECLUTTER_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem decluttering article: {}'.format(str(e)))
-        return None
-
-def get_comments(url):
-    try:
-        r = requests.post(DECLUTTER_COMMENT_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem getting comments for article: {}'.format(str(e)))
-        return None
+from settings import HEADLESS_READER_PORT, SIMPLE_READER_PORT
+
+class Simple:
+    def __init__(self, host, name, internal=True, timeout=20):
+        self.host = host
+        self.name = name
+        self.internal = internal
+        self.timeout = timeout
+        self.variant = 'simple'
+
+    def as_readable(self, details):
+        if not self.internal:
+            details['scraper_link'] = self.host
+        return details
+
+    def get_html(self, url):
+        details = self.get_details(url)
+        if not details:
+            return ''
+        return details['content']
+
+    def get_details(self, url):
+        logging.info(f"{self.name} Scraper: {url}")
+        details = self._json(f"{self.host}/{self.variant}/details", dict(url=url), "article")
+        if not details: return None
+        return self.as_readable(details)
+
+    def _json(self, url, data, adjective):
+        try:
+            r = requests.post(url, data=data, timeout=self.timeout)
+            if r.status_code != 200:
+                raise Exception('Bad response code ' + str(r.status_code))
+            return r.json()
+        except KeyboardInterrupt:
+            raise
+        except BaseException as e:
+            logging.error('{}: Problem scraping {}: {}'.format(self.name, adjective, str(e)))
+            return None
+
+class Headless(Simple):
+    def __init__(self, host, name, internal=True, timeout=90):
+        self.host = host
+        self.name = name
+        self.internal = internal
+        self.timeout = timeout
+        self.variant = 'headless'
+
+    def get_comments(self, url):
+        logging.info(f"{self.name} Scraper: {url}")
+        comments = self._json(f"{self.host}/{self.variant}/comments", dict(url=url), "comments")
+        if not comments: return None
+        return comments
+
+declutter = Headless('https://declutter.1j.nz', 'Declutter scraper', internal=False)
+headless = Headless(f"http://127.0.0.1:{HEADLESS_READER_PORT or 33843}", 'Headless scraper')
+simple = Simple(f"http://127.0.0.1:{SIMPLE_READER_PORT or 33843}", 'Simple scraper')
 
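A rough usage sketch of the scraper singletons defined above (the article URL is hypothetical):

    from scrapers.declutter import declutter, headless, simple

    url = 'https://example.com/article'   # hypothetical
    html = headless.get_html(url)         # local headless reader
    if not html:
        html = simple.get_html(url)       # lighter local fallback

    # get_comments only exists on Headless instances; declutter points at the
    # remote host (internal=False), so its get_details() results also carry
    # a scraper_link naming that host.
    comments = declutter.get_comments(url)
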
@@ -1,41 +0,0 @@
-import logging
-logging.basicConfig(
-        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-        level=logging.DEBUG)
-import requests
-from settings import HEADLESS_READER_PORT
-
-READ_API = 'http://127.0.0.1:{}/headless/details'.format(HEADLESS_READER_PORT or 33843)
-READ_COMMENT__API = 'http://127.0.0.1:{}/headless/comments'.format(HEADLESS_READER_PORT or 33843)
-TIMEOUT = 90
-
-def get_html(url):
-    logging.info(f"Headless Scraper: {url}")
-    details = get_details(url)
-    if not details:
-        return ''
-    return details['content']
-
-def get_details(url):
-    try:
-        r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem scraping article: {}'.format(str(e)))
-        return None
-
-def get_comments(url):
-    try:
-        r = requests.post(READ_COMMENT_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem getting comments for article: {}'.format(str(e)))
-        return None
 
@@ -29,8 +29,10 @@ def as_readable(details):
         'siteName': details['site_name'],
         'url': details['article_url'],
         'publisher': details['site_name'],
-        'scraper_link': 'https://outline.com/' + details['short_code']
+        'scraper_link': 'https://outline.com/' + details['short_code'],
+        'meta': {}
     }
+    readable['meta'].update(details['meta'])
     return readable
 
 def _get_outline(url):
@@ -40,8 +42,7 @@ def _get_outline(url):
         headers = {'Referer': OUTLINE_REFERER}
         r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=TIMEOUT)
         if r.status_code == 429:
-            logging.info('Rate limited by outline, sleeping 30s and skipping...')
-            time.sleep(30)
+            logging.info('Rate limited by outline, skipping...')
             return None
         if r.status_code != 200:
             raise Exception('Bad response code ' + str(r.status_code))
 
@@ -1,28 +0,0 @@
-import logging
-logging.basicConfig(
-        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-        level=logging.DEBUG)
-import requests
-from settings import SIMPLE_READER_PORT
-
-READ_API = 'http://127.0.0.1:{}/simple/details'.format(SIMPLE_READER_PORT or 33843)
-TIMEOUT = 20
-
-def get_html(url):
-    logging.info(f"Simple Scraper: {url}")
-    details = get_details(url)
-    if not details:
-        return ''
-    return details['content']
-
-def get_details(url):
-    try:
-        r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem getting article: {}'.format(str(e)))
-        return None
Submodule readerserver updated: 1a81bc139f...006be62214
@@ -43,6 +43,9 @@
     content={fromUnixTime(story.date).toISOString()} />
   <meta property="article:author" content={story.author || story.source} />
   <meta property="og:description" content={story.excerpt || story.title} />
+  {#if story.image}
+    <meta property="og:image" content={story.image} />
+  {/if}
 </svelte:head>
 
 <section class="single">
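When story.image is set, the block above renders an Open Graph image tag into the page head, e.g. (hypothetical URL):

    <meta property="og:image" content="https://example.com/og.png" />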