split categories, sitemap and other crap out of news.py

2020-11-16 15:30:33 +13:00
parent b80c1a5cb5
commit 6a91b9402f
8 changed files with 384 additions and 310 deletions
@@ -0,0 +1,35 @@
+import logging
+# Configure the root logger when this module is imported: DEBUG level, with
+# the timestamp, logger name and level name prefixed to every message.
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+
+import requests
+
+# Sent as the User-Agent header by xml() and json(); the string identifies
+# the client as Googlebot.
+USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
+# Sent as the X-Forwarded-For header by xml() and json().
+# NOTE(review): presumably an address chosen to match the Googlebot
+# User-Agent above - confirm.
+FORWARD_IP = '66.249.66.1'
+
+def xml(route, ref=None):
+    """Fetch ``route(ref)`` over HTTP and return the response body as text.
+
+    Args:
+        route: Callable that is invoked as ``route(ref)`` and returns the URL
+            to request.
+        ref: Value handed to ``route``; defaults to None.
+
+    Returns:
+        The response text when the server answers with status 200, otherwise
+        False. Failures (building the URL, network errors, timeouts after
+        5 seconds, non-200 status codes) are logged and reported as False.
+    """
+    try:
+        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP}
+        r = requests.get(route(ref), headers=headers, timeout=5)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.text
+    # Catch Exception rather than BaseException so that KeyboardInterrupt,
+    # SystemExit and GeneratorExit propagate instead of being reported as a
+    # failed fetch.
+    except Exception as e:
+        logging.error('Problem hitting URL: {}'.format(str(e)))
+        return False
+
+def json(route, ref=None):
+    """Fetch ``route(ref)`` over HTTP and return the decoded JSON body.
+
+    Args:
+        route: Callable that is invoked as ``route(ref)`` and returns the URL
+            to request.
+        ref: Value handed to ``route``; defaults to None.
+
+    Returns:
+        The value produced by the response's ``json()`` method when the
+        server answers with status 200, otherwise False. Failures (building
+        the URL, network errors, timeouts after 5 seconds, non-200 status
+        codes, bodies that fail to decode) are logged and reported as False.
+    """
+    try:
+        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP}
+        r = requests.get(route(ref), headers=headers, timeout=5)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    # Catch Exception rather than BaseException so that KeyboardInterrupt,
+    # SystemExit and GeneratorExit propagate instead of being reported as a
+    # failed fetch.
+    except Exception as e:
+        logging.error('Problem hitting URL: {}'.format(str(e)))
+        return False
@@ -0,0 +1,69 @@
+
+def _set_title_and_date(s, node):
+    """Copy 'headline' and the article dates from ``node`` into ``s``.
+
+    'datePublished' is applied after 'dateModified', so it wins when both
+    are present and non-empty. Missing keys are skipped.
+    """
+    if 'headline' in node:
+        s['title'] = node['headline']
+    if node.get('dateModified'):
+        s['date'] = node['dateModified']
+    if node.get('datePublished'):
+        s['date'] = node['datePublished']
+
+
+def _first_if_list(value):
+    """Return the first element of a list (None if empty), else ``value``."""
+    if isinstance(value, list):
+        return value[0] if value else None
+    return value
+
+
+def parse_extruct(s, data):
+    """Fill ``s`` with title, date and author found in structured metadata.
+
+    The syntaxes are read in the order rdfa, opengraph, microdata, json-ld;
+    each one overwrites values set by an earlier one. Absent syntaxes and
+    absent fields are skipped rather than raising.
+
+    Args:
+        s: Dict that is updated in place; the keys 'title', 'date' and
+            'author' are set when a value is found.
+        data: Dict with the optional keys 'rdfa', 'opengraph', 'microdata'
+            and 'json-ld', each holding a list of entries.
+
+    Returns:
+        The same ``s`` object that was passed in.
+    """
+    rdfa_keys = {
+        'title': [
+            'http://ogp.me/ns#title',
+            'https://ogp.me/ns#title',
+        ],
+        'date': [
+            'http://ogp.me/ns/article#modified_time',
+            'https://ogp.me/ns/article#modified_time',
+            'http://ogp.me/ns/article#published_time',
+            'https://ogp.me/ns/article#published_time',
+        ]
+    }
+    article_types = ('Article', 'NewsArticle')
+
+    for rdfa in data.get('rdfa') or []:
+        for props in rdfa.values():
+            # Only dict values can hold the property lists looked up below;
+            # plain strings (such as an '@id' URL) are skipped.
+            if not isinstance(props, dict):
+                continue
+            for attribute, properties in rdfa_keys.items():
+                for prop in properties:
+                    for values in props.get(prop, []):
+                        if isinstance(values, dict) and '@value' in values:
+                            s[attribute] = values['@value']
+
+    for og in data.get('opengraph') or []:
+        pairs = og.get('properties') or []
+        titles = [value for key, value in pairs if 'og:title' in key and value]
+        modified = [value for key, value in pairs if 'article:modified_time' in key and value]
+        published = [value for key, value in pairs if 'article:published_time' in key and value]
+        if modified:
+            s['date'] = modified[0]
+        # Applied second, so the published time wins over the modified time.
+        if published:
+            s['date'] = published[0]
+        if titles:
+            s['title'] = titles[0]
+
+    for md in data.get('microdata') or []:
+        if md.get('type') in ('https://schema.org/NewsArticle', 'http://schema.org/NewsArticle'):
+            props = md.get('properties') or {}
+            _set_title_and_date(s, props)
+            # The author may be a single item or a list of items; the name
+            # lives under the item's own 'properties'.
+            author = _first_if_list(props.get('author'))
+            if isinstance(author, dict):
+                author_props = author.get('properties')
+                if isinstance(author_props, dict) and 'name' in author_props:
+                    s['author'] = author_props['name']
+
+    for ld in data.get('json-ld') or []:
+        if ld.get('@type') in article_types:
+            _set_title_and_date(s, ld)
+            # The author may be a single object or a list of objects; plain
+            # strings carry no 'name' field and are ignored.
+            author = _first_if_list(ld.get('author'))
+            if isinstance(author, dict) and 'name' in author:
+                s['author'] = author['name']
+        for gld in ld.get('@graph') or []:
+            if gld.get('@type') in article_types:
+                _set_title_and_date(s, gld)
+
+    return s
@@ -0,0 +1,18 @@
+import pytz
+import dateutil.parser
+
+
+# Timezone abbreviations handed to dateutil's parser as `tzinfos` (the default
+# for unix()): both New Zealand abbreviations map to the Pacific/Auckland zone.
+TZINFOS = {
+    'NZDT': pytz.timezone('Pacific/Auckland'),
+    'NZST': pytz.timezone('Pacific/Auckland')
+}
+
+def unix(date_str, tz=None, tzinfos=TZINFOS):
+    """Parse ``date_str`` and return it as an integer Unix timestamp.
+
+    Args:
+        date_str: Date/time text understood by ``dateutil.parser.parse``.
+        tz: Optional timezone name (as accepted by ``pytz.timezone``) that is
+            applied when the parsed value carries no timezone of its own.
+        tzinfos: Mapping of timezone abbreviations passed to the parser.
+
+    Returns:
+        Seconds since the epoch as an int, or 0 when the value cannot be
+        parsed or converted.
+    """
+    try:
+        dt = dateutil.parser.parse(date_str, tzinfos=tzinfos)
+        # localize() rejects datetimes that already have a tzinfo, so only
+        # apply `tz` to naive values; an explicit offset in the string wins.
+        if tz and dt.tzinfo is None:
+            dt = pytz.timezone(tz).localize(dt)
+        return int(dt.timestamp())
+    # Best-effort conversion: any parsing/conversion error yields 0, but
+    # KeyboardInterrupt and SystemExit are no longer swallowed.
+    except Exception:
+        return 0