renaming things.

master
Jason Schwarzenberger 4 years ago
parent 55d50a86d8
commit 3b885e4327
13 changed files (changed line counts in parentheses):

  1. .gitmodules (6)
  2. apiserver/feed.py (14)
  3. apiserver/scrapers/headless.py (8)
  4. apiserver/scrapers/simple.py (4)
  5. apiserver/settings.py.example (2)
  6. readerserver/main.js (20)
  7. readerserver/scraper/browser/scripts/bypass-paywalls-chrome (1)
  8. readerserver/scraper/headless/_browser.js (6)
  9. readerserver/scraper/headless/_comments.js (0)
  10. readerserver/scraper/headless/index.js (0)
  11. readerserver/scraper/headless/scripts/bypass-paywalls-chrome (1)
  12. readerserver/scraper/headless/scripts/cosmetic-filters.js (0)
  13. readerserver/scraper/headless/scripts/fix-relative-links.js (0)

.gitmodules (vendored)

@@ -1,3 +1,3 @@
-[submodule "readerserver/scraper/browser/scripts/bypass-paywalls-chrome"]
-	path = readerserver/scraper/browser/scripts/bypass-paywalls-chrome
-	url = https://github.com/iamadamdev/bypass-paywalls-chrome.git
+[submodule "readerserver/scraper/headless/scripts/bypass-paywalls-chrome"]
+	path = readerserver/scraper/headless/scripts/bypass-paywalls-chrome
+	url = https://github.com/iamadamdev/bypass-paywalls-chrome/

apiserver/feed.py

@@ -12,7 +12,7 @@ import settings
 from feeds import hackernews, reddit, tildes, substack, manual
 from feeds.sitemap import Sitemap
 from feeds.category import Category
-from scrapers import outline, declutter, browser, local
+from scrapers import outline, declutter, headless, simple
 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
@@ -63,14 +63,14 @@ def get_list():
 def get_article(url):
 	scrapers = {
-		'declutter': declutter,
+		'headless': headless,
+		'simple': simple,
 		'outline': outline,
-		'browser': browser,
-		'local': local,
+		'declutter': declutter,
 	}
-	available = settings.SCRAPERS or ['local']
-	if 'local' not in available:
-		available += ['local']
+	available = settings.SCRAPERS or ['headless', 'simple']
+	if 'simple' not in available:
+		available += ['simple']
 	for scraper in available:
 		if scraper not in scrapers.keys():
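The tail of get_article is truncated in the hunk above. A minimal sketch of the fallback loop it implies, reusing the imports shown in the first hunk, and assuming each scraper module's get_html returns an empty string on failure (the error handling in the scraper hunks below suggests exactly that):

    def get_article(url):
        scrapers = {
            'headless': headless,
            'simple': simple,
            'outline': outline,
            'declutter': declutter,
        }
        available = settings.SCRAPERS or ['headless', 'simple']
        if 'simple' not in available:
            available += ['simple']  # 'simple' always remains as the last resort
        for scraper in available:
            if scraper not in scrapers.keys():
                continue  # skip unknown names (assumption)
            html = scrapers[scraper].get_html(url)
            if html:
                return html  # first scraper to produce content wins (assumption)
        return ''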

apiserver/scrapers/headless.py

@@ -4,13 +4,13 @@ logging.basicConfig(
 	level=logging.DEBUG)
 import requests
-READ_API = 'http://127.0.0.1:33843/browser/details'
-READ_COMMENT__API = 'http://127.0.0.1:33843/browser/commentd'
+READ_API = 'http://127.0.0.1:33843/headless/details'
+READ_COMMENT__API = 'http://127.0.0.1:33843/headless/comments'
 TIMEOUT = 60
 def get_html(url):
-	logging.info(f"Reader Scraper: {url}")
+	logging.info(f"Headless Browser Scraper: {url}")
 	details = get_details(url)
 	if not details:
 		return ''
@@ -25,7 +25,7 @@ def get_details(url):
 	except KeyboardInterrupt:
 		raise
 	except BaseException as e:
-		logging.error('Problem Scraping article: {}'.format(str(e)))
+		logging.error('Problem scraping article: {}'.format(str(e)))
 		return None
 def get_comments(url):
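The body of get_details is not shown in the diff. A plausible sketch of the client call behind these constants, assuming the reader server accepts a form-encoded url field (as express.urlencoded in readerserver/main.js suggests) and responds with JSON; only the except clauses are taken verbatim from the hunk above:

    def get_details(url):
        try:
            r = requests.post(READ_API, data={'url': url}, timeout=TIMEOUT)
            r.raise_for_status()
            return r.json()  # assumed shape, e.g. {'title': ..., 'content': ...}
        except KeyboardInterrupt:
            raise
        except BaseException as e:
            logging.error('Problem scraping article: {}'.format(str(e)))
            return None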

apiserver/scrapers/simple.py

@@ -4,11 +4,11 @@ logging.basicConfig(
 	level=logging.DEBUG)
 import requests
-READ_API = 'http://127.0.0.1:33843/details'
+READ_API = 'http://127.0.0.1:33843/simple/details'
 TIMEOUT = 20
 def get_html(url):
-	logging.info(f"Local Scraper: {url}")
+	logging.info(f"Simple Scraper: {url}")
 	details = get_details(url)
 	if not details:
 		return ''

apiserver/settings.py.example

@@ -51,7 +51,7 @@ CATEGORY = {}
 # 	],
 # }
-SCRAPERS = ['browser', 'declutter', 'outline', 'local']
+SCRAPERS = ['headless', 'outline', 'declutter', 'simple']
 # Reddit account info
 # leave blank if not using Reddit
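Since feed.py treats this list as a priority order and appends 'simple' when it is absent, a deployment without a headless browser could use a minimal override (hypothetical configuration, not part of this commit):

    # settings.py (sketch): only the lightweight scraper, no browser required;
    # feed.py leaves the list unchanged because 'simple' is already present.
    SCRAPERS = ['simple']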

readerserver/main.js

@@ -2,12 +2,18 @@ const port = 33843;
 const express = require('express');
 const app = express();
 const simple = require('./scraper/simple');
-const browser = require('./scraper/browser');
+const headless = require('./scraper/headless');
 app.use(express.urlencoded({ extended: true }));
 app.get('/', (req, res) => {
-	const routes = ['/', '/details', '/browser', '/browser/details', '/browser/comments'];
+	const routes = [
+		'/simple',
+		'/simple/details',
+		'/headless',
+		'/headless/details',
+		'/headless/comments'
+	];
 	const html = routes.map(route => `
 		<form method="POST" action="${route}" accept-charset="UTF-8">
@@ -19,11 +25,11 @@ app.get('/', (req, res) => {
 	</form>`).join('<hr />');
 	res.send(html);
 });
-app.post('/', simple.scrape);
-app.post('/details', simple.details);
-app.post('/browser', browser.scrape);
-app.post('/browser/details', browser.details);
-app.post('/browser/comments', browser.comments);
+app.post('/simple/', simple.scrape);
+app.post('/simple/details', simple.details);
+app.post('/headless', headless.scrape);
+app.post('/headless/details', headless.details);
+app.post('/headless/comments', headless.comments);
 app.listen(port, () => {
 	console.log(`Example app listening on port ${port}!`);
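For a quick check of the renamed endpoints, a smoke-test sketch from the Python side (route paths and the form-encoded url field come from the diff above; the target URL is a placeholder):

    import requests

    BASE = 'http://127.0.0.1:33843'
    for route in ['/simple/details', '/headless/details', '/headless/comments']:
        # express.urlencoded() on the server side implies a form-encoded body
        r = requests.post(BASE + route, data={'url': 'https://example.com/'}, timeout=60)
        print(route, r.status_code)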

readerserver/scraper/browser/scripts/bypass-paywalls-chrome (submodule removed)

@@ -1 +0,0 @@
-Subproject commit 0f129c5d6c206fde389878fed4d26fec90923a21

readerserver/scraper/headless/_browser.js

@@ -25,9 +25,9 @@ module.exports.getDetails = async (url) => {
 		}
 		return route.continue();
 	});
-	await tab.addInitScript({ path: "scraper/browser/scripts/bypass-paywalls-chrome/src/js/contentScript.js" });
-	await tab.addInitScript({ path: "scraper/browser/scripts/cosmetic-filters.js" });
-	await tab.addInitScript({ path: "scraper/browser/scripts/fix-relative-links.js" });
+	await tab.addInitScript({ path: "scraper/headless/scripts/bypass-paywalls-chrome/src/js/contentScript.js" });
+	await tab.addInitScript({ path: "scraper/headless/scripts/cosmetic-filters.js" });
+	await tab.addInitScript({ path: "scraper/headless/scripts/fix-relative-links.js" });
 	await tab.goto(url, { timeout: 60000, waitUntil: "domcontentloaded" });
 	await tab.waitForTimeout(2000);
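The same init-script wiring, sketched with Playwright's Python API for reference (the project itself uses the Node API shown above; the launch and page setup here are assumptions):

    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser = p.chromium.launch()
        tab = browser.new_page()
        # inject the renamed scripts before any page script runs
        tab.add_init_script(path="scraper/headless/scripts/bypass-paywalls-chrome/src/js/contentScript.js")
        tab.add_init_script(path="scraper/headless/scripts/cosmetic-filters.js")
        tab.add_init_script(path="scraper/headless/scripts/fix-relative-links.js")
        tab.goto("https://example.com/", timeout=60000, wait_until="domcontentloaded")
        tab.wait_for_timeout(2000)
        html = tab.content()
        browser.close()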

readerserver/scraper/headless/scripts/bypass-paywalls-chrome (new submodule)

@@ -0,0 +1 @@
+Subproject commit fff7f483db947e690977bfc80955a53329d3d349