renaming things.

Branch: master
Jason Schwarzenberger, 4 years ago
parent 55d50a86d8, commit 3b885e4327
Changed files (lines changed in parentheses):

   1. .gitmodules (6)
   2. apiserver/feed.py (14)
   3. apiserver/scrapers/headless.py (8)
   4. apiserver/scrapers/simple.py (4)
   5. apiserver/settings.py.example (2)
   6. readerserver/main.js (20)
   7. readerserver/scraper/browser/scripts/bypass-paywalls-chrome (1)
   8. readerserver/scraper/headless/_browser.js (6)
   9. readerserver/scraper/headless/_comments.js (0)
  10. readerserver/scraper/headless/index.js (0)
  11. readerserver/scraper/headless/scripts/bypass-paywalls-chrome (1)
  12. readerserver/scraper/headless/scripts/cosmetic-filters.js (0)
  13. readerserver/scraper/headless/scripts/fix-relative-links.js (0)

.gitmodules (vendored, 6 changes)

@@ -1,3 +1,3 @@
-[submodule "readerserver/scraper/browser/scripts/bypass-paywalls-chrome"]
-    path = readerserver/scraper/browser/scripts/bypass-paywalls-chrome
-    url = https://github.com/iamadamdev/bypass-paywalls-chrome.git
+[submodule "readerserver/scraper/headless/scripts/bypass-paywalls-chrome"]
+    path = readerserver/scraper/headless/scripts/bypass-paywalls-chrome
+    url = https://github.com/iamadamdev/bypass-paywalls-chrome/

apiserver/feed.py (14 changes)

@@ -12,7 +12,7 @@ import settings
 from feeds import hackernews, reddit, tildes, substack, manual
 from feeds.sitemap import Sitemap
 from feeds.category import Category
-from scrapers import outline, declutter, browser, local
+from scrapers import outline, declutter, headless, simple

 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']

@@ -63,14 +63,14 @@ def get_list():
 def get_article(url):
     scrapers = {
-        'declutter': declutter,
+        'headless': headless,
+        'simple': simple,
         'outline': outline,
-        'browser': browser,
-        'local': local,
+        'declutter': declutter,
     }
-    available = settings.SCRAPERS or ['local']
-    if 'local' not in available:
-        available += ['local']
+    available = settings.SCRAPERS or ['headless', 'simple']
+    if 'simple' not in available:
+        available += ['simple']
     for scraper in available:
         if scraper not in scrapers.keys():
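The second hunk cuts off inside the scraper-selection loop. For context, a minimal sketch of how the fallback plausibly completes, assuming each scraper module exposes a get_html(url) function and an empty string signals failure (the loop body and final return are not shown in this diff):

    def get_article(url):
        scrapers = {
            'headless': headless,
            'simple': simple,
            'outline': outline,
            'declutter': declutter,
        }
        available = settings.SCRAPERS or ['headless', 'simple']
        if 'simple' not in available:
            available += ['simple']  # keep the cheapest scraper as a last resort

        for scraper in available:
            if scraper not in scrapers.keys():
                continue  # ignore unknown names from settings
            try:
                html = scrapers[scraper].get_html(url)  # assumed common interface
                if html:
                    return html
            except KeyboardInterrupt:
                raise
            except BaseException:
                continue  # on any failure, try the next scraper in order
        return ''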

apiserver/scrapers/headless.py (8 changes)

@@ -4,13 +4,13 @@ logging.basicConfig(
     level=logging.DEBUG)

 import requests

-READ_API = 'http://127.0.0.1:33843/browser/details'
-READ_COMMENT__API = 'http://127.0.0.1:33843/browser/commentd'
+READ_API = 'http://127.0.0.1:33843/headless/details'
+READ_COMMENT__API = 'http://127.0.0.1:33843/headless/comments'
 TIMEOUT = 60

 def get_html(url):
-    logging.info(f"Reader Scraper: {url}")
+    logging.info(f"Headless Browser Scraper: {url}")
     details = get_details(url)
     if not details:
         return ''

@@ -25,7 +25,7 @@ def get_details(url):
     except KeyboardInterrupt:
         raise
     except BaseException as e:
-        logging.error('Problem Scraping article: {}'.format(str(e)))
+        logging.error('Problem scraping article: {}'.format(str(e)))
         return None

 def get_comments(url):
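Note that the endpoint rename also fixes a typo in the old comments path ('commentd'). The body of get_details falls between the two hunks above; a plausible sketch, assuming the reader server accepts the target URL as a form field named url and replies with JSON (both assumptions, not shown in the diff):

    def get_details(url):
        try:
            # assumed request shape: url-encoded form field 'url'
            r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
            if r.status_code != 200:
                raise Exception('Bad response code ' + str(r.status_code))
            return r.json()  # assumed: JSON payload describing the article
        except KeyboardInterrupt:
            raise
        except BaseException as e:
            logging.error('Problem scraping article: {}'.format(str(e)))
            return None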

apiserver/scrapers/simple.py (4 changes)

@@ -4,11 +4,11 @@ logging.basicConfig(
     level=logging.DEBUG)

 import requests

-READ_API = 'http://127.0.0.1:33843/details'
+READ_API = 'http://127.0.0.1:33843/simple/details'
 TIMEOUT = 20

 def get_html(url):
-    logging.info(f"Local Scraper: {url}")
+    logging.info(f"Simple Scraper: {url}")
     details = get_details(url)
     if not details:
         return ''

apiserver/settings.py.example (2 changes)

@@ -51,7 +51,7 @@ CATEGORY = {}
 #     ],
 # }

-SCRAPERS = ['browser', 'declutter', 'outline', 'local']
+SCRAPERS = ['headless', 'outline', 'declutter', 'simple']

 # Reddit account info
 # leave blank if not using Reddit
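Since feed.py (above) appends 'simple' whenever it is missing, SCRAPERS only controls the preferred order; the simple scraper always remains the last resort. For example:

    # settings.py
    SCRAPERS = ['headless', 'outline']
    # get_article will effectively try: headless, outline, simple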

readerserver/main.js (20 changes)

@@ -2,12 +2,18 @@ const port = 33843;
 const express = require('express');
 const app = express();
 const simple = require('./scraper/simple');
-const browser = require('./scraper/browser');
+const headless = require('./scraper/headless');

 app.use(express.urlencoded({ extended: true }));

 app.get('/', (req, res) => {
-    const routes = ['/', '/details', '/browser', '/browser/details', '/browser/comments'];
+    const routes = [
+        '/simple',
+        '/simple/details',
+        '/headless',
+        '/headless/details',
+        '/headless/comments'
+    ];

     const html = routes.map(route => `
         <form method="POST" action="${route}" accept-charset="UTF-8">

@@ -19,11 +25,11 @@ app.get('/', (req, res) => {
         </form>`).join('<hr />');
     res.send(html);
 });

-app.post('/', simple.scrape);
-app.post('/details', simple.details);
-app.post('/browser', browser.scrape);
-app.post('/browser/details', browser.details);
-app.post('/browser/comments', browser.comments);
+app.post('/simple/', simple.scrape);
+app.post('/simple/details', simple.details);
+app.post('/headless', headless.scrape);
+app.post('/headless/details', headless.details);
+app.post('/headless/comments', headless.comments);

 app.listen(port, () => {
     console.log(`Example app listening on port ${port}!`);
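With the routes renamed, a client posts the article URL as url-encoded form data (matching the express.urlencoded middleware above). A quick manual check against the new endpoints, assuming a form field named url and a JSON response, as the apiserver scrapers expect:

    import requests

    resp = requests.post(
        'http://127.0.0.1:33843/simple/details',
        data={'url': 'https://example.com/article'},  # field name is an assumption
        timeout=20)
    print(resp.json())

Note that app.post('/simple/', ...) registers a trailing slash while the route list advertises '/simple'; Express's default non-strict routing treats the two paths the same.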

readerserver/scraper/browser/scripts/bypass-paywalls-chrome (submodule removed)

@@ -1 +0,0 @@
-Subproject commit 0f129c5d6c206fde389878fed4d26fec90923a21

readerserver/scraper/headless/_browser.js (6 changes)

@@ -25,9 +25,9 @@ module.exports.getDetails = async (url) => {
         }
         return route.continue();
     });
-    await tab.addInitScript({ path: "scraper/browser/scripts/bypass-paywalls-chrome/src/js/contentScript.js" });
-    await tab.addInitScript({ path: "scraper/browser/scripts/cosmetic-filters.js" });
-    await tab.addInitScript({ path: "scraper/browser/scripts/fix-relative-links.js" });
+    await tab.addInitScript({ path: "scraper/headless/scripts/bypass-paywalls-chrome/src/js/contentScript.js" });
+    await tab.addInitScript({ path: "scraper/headless/scripts/cosmetic-filters.js" });
+    await tab.addInitScript({ path: "scraper/headless/scripts/fix-relative-links.js" });
     await tab.goto(url, { timeout: 60000, waitUntil: "domcontentloaded" });
     await tab.waitForTimeout(2000);

readerserver/scraper/headless/scripts/bypass-paywalls-chrome (submodule added)

@@ -0,0 +1 @@
+Subproject commit fff7f483db947e690977bfc80955a53329d3d349