forked from tanner/qotnews
Compare commits
24 Commits
2f730c1f52
...
2439c113b3
Author | SHA1 | Date |
---|---|---|
Jason Schwarzenberger | 2439c113b3 | 4 years ago |
Jason Schwarzenberger | 0f5e28136d | 4 years ago |
Jason Schwarzenberger | bb1413b586 | 4 years ago |
Jason Schwarzenberger | 0a27c0da1f | 4 years ago |
Jason Schwarzenberger | fe01ea52e5 | 4 years ago |
Jason Schwarzenberger | 3daae5fa1b | 4 years ago |
Jason Schwarzenberger | 25caee17d6 | 4 years ago |
Jason Schwarzenberger | c1b6349771 | 4 years ago |
Jason | 54a4c7e55a | 4 years ago |
Jason | b12a3570b0 | 4 years ago |
Jason Schwarzenberger | 0bfa920654 | 4 years ago |
Jason Schwarzenberger | 9341b4d966 | 4 years ago |
Jason Schwarzenberger | a2e5faa3b5 | 4 years ago |
Jason Schwarzenberger | a86eb98c1a | 4 years ago |
Jason Schwarzenberger | abf7f0a802 | 4 years ago |
Jason Schwarzenberger | d288546d6f | 4 years ago |
Jason Schwarzenberger | cc130942ca | 4 years ago |
Jason Schwarzenberger | f0b14408d4 | 4 years ago |
Jason Schwarzenberger | e1830a589b | 4 years ago |
Jason Schwarzenberger | 32bc3b906b | 4 years ago |
Jason Schwarzenberger | f5e65632b8 | 4 years ago |
Jason Schwarzenberger | 1fe524207e | 4 years ago |
Jason Schwarzenberger | dc3d17b171 | 4 years ago |
Jason Schwarzenberger | 539350a83d | 4 years ago |
25 changed files with 3389 additions and 2612 deletions
@ -0,0 +1,14 @@ |
||||
from bs4 import BeautifulSoup |
||||
|
||||
def get_icons(markup): |
||||
soup = BeautifulSoup(markup, features='html.parser') |
||||
icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32") |
||||
icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16") |
||||
favicon = soup.find_all('link', rel="shortcut icon", href=True) |
||||
others = soup.find_all('link', rel="icon", href=True) |
||||
icons = icon32 + icon16 + favicon + others |
||||
base_url = '/'.join(urlref.split('/')[:3]) |
||||
icons = list(set([i.get('href') for i in icons])) |
||||
icons = [i if i.startswith('http') else base_url + i for i in icons] |
||||
|
||||
return icons |
@ -0,0 +1,64 @@ |
||||
import re |
||||
from bs4 import BeautifulSoup |
||||
|
||||
if __name__ == '__main__': |
||||
import sys |
||||
sys.path.insert(0,'.') |
||||
|
||||
from misc.time import unix |
||||
from misc.api import xml |
||||
|
||||
def _soup_get_text(soup): |
||||
if not soup: return None |
||||
if soup.text: return soup.text |
||||
|
||||
s = soup.find(text=lambda tag: isinstance(tag, bs4.CData)) |
||||
if s and s.string: return s.string.strip() |
||||
return None |
||||
|
||||
def _parse_comment(soup): |
||||
c = { |
||||
'author': '', |
||||
'authorLink': '', |
||||
'score': 0, |
||||
'date': 0, |
||||
'text': '', |
||||
'comments': [], |
||||
} |
||||
|
||||
if soup.find('link'): |
||||
title = _soup_get_text(soup.find('link')) |
||||
if title and 'By:' in title: |
||||
c['author'] = title.strip('By:').strip() |
||||
if soup.find('dc:creator'): |
||||
c['author'] = _soup_get_text(soup.find('dc:creator')) |
||||
if soup.find('link'): |
||||
c['authorLink'] = _soup_get_text(soup.find('link')) |
||||
if soup.find('description'): |
||||
c['text'] = _soup_get_text(soup.find('description')) |
||||
if soup.find('pubdate'): |
||||
c['date'] = unix(soup.find('pubdate').text) |
||||
elif soup.find('pubDate'): |
||||
c['date'] = unix(soup.find('pubDate').text) |
||||
|
||||
return c |
||||
|
||||
def get_comments(url): |
||||
regex = r"https:\/\/www\.stuff\.co\.nz\/(.*\/\d+)/[^\/]+" |
||||
p = re.compile(regex).match(url) |
||||
path = p.groups()[0] |
||||
comment_url = f'https://comments.us1.gigya.com/comments/rss/6201101/Stuff/stuff/{path}' |
||||
markup = xml(lambda x: comment_url) |
||||
if not markup: return [] |
||||
soup = BeautifulSoup(markup, features='html.parser') |
||||
comments = soup.find_all('item') |
||||
if not comments: return [] |
||||
comments = [_parse_comment(c) for c in comments] |
||||
return comments |
||||
|
||||
|
||||
# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
    test_url = 'https://www.stuff.co.nz/life-style/homed/houses/123418468/dear-jacinda-we-need-to-talk-about-housing'
    results = get_comments(test_url)
    # quick sanity output: comment count, then a sample of the parsed dicts
    print(len(results))
    print(results[:5])
@ -0,0 +1,48 @@ |
||||
import logging
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO)

import sys
import json
import requests

import database
import feed
import search

# NOTE(review): module-level side effects — the database and search index
# are initialized as soon as this script module is imported.
database.init()
search.init()
||||
|
||||
def _update_current_story(story, item): |
||||
logging.info('Updating story: {}'.format(str(story['ref']))) |
||||
|
||||
if story.get('url', ''): |
||||
story['text'] = '' |
||||
|
||||
valid = feed.update_story(story, urlref=item['urlref']) |
||||
if valid: |
||||
database.put_story(story) |
||||
search.put_story(story) |
||||
else: |
||||
database.del_ref(item['ref']) |
||||
logging.info('Removed ref {}'.format(item['ref'])) |
||||
|
||||
if __name__ == '__main__':
    # guard clause: exactly one argument (the story id) is required
    if len(sys.argv) != 2:
        print('Usage: python delete-story.py [story id]')
        exit(1)
    sid = sys.argv[1]

    item = database.get_ref_by_sid(sid)
    # only look up the story body when the ref exists
    story = database.get_story(item['sid']).data if item else None

    if story:
        print('Updating story:')
        _update_current_story(story, item)
    else:
        print('Story not found. Exiting.')
@ -1 +1 @@ |
||||
Subproject commit 50a94df7283e31680c5d94dd666bab58aea2e475 |
||||
Subproject commit d3d5fc74acf0be8a49e2772b42ab59278d1a3e81 |
File diff suppressed because it is too large
Load Diff
Loading…
Reference in new issue