commit c74caeb975ca120edf75f8d0a6bb865a1c144952
parent e4b2ef0192c7d75583f6a417c260585566b5125f
Author: Natalie Pendragon <natpen@natpen.net>
Date: Wed, 8 Jul 2020 06:18:15 -0400
[crawl] Add feature to seed incremental crawl with atom feeds
Diffstat:
4 files changed, 152 insertions(+), 17 deletions(-)
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -151,7 +151,14 @@ def load_indexed_urls(index_dir):
return indexed_urls
-def build_index(should_run_destructive=False):
+def invalidate_recent_results(invalidation_window):
+ recency_minimum = datetime.now() - timedelta(hours=invalidation_window)
+ pages = Page.select().where(Page.indexed_at.is_null(False), Page.indexed_at > recency_minimum)
+ for page in pages:
+ index_writer.delete_by_term("url_id", page.url, searcher=None)
+
+
+def build_index(should_run_destructive=False, invalidation_window=0):
global index_dir
index_dir = INDEX_DIR_NEW if should_run_destructive else INDEX_DIR_CURRENT
global index_storage
@@ -165,6 +172,8 @@ def build_index(should_run_destructive=False):
ix = index_storage.open_index()
global index_writer
index_writer = ix.writer()
+
+ invalidate_recent_results(invalidation_window)
indexed_urls = [] if should_run_destructive else load_indexed_urls(INDEX_DIR_CURRENT)
pages = Page.select().where(Page.indexed_at.is_null(False))
@@ -183,7 +192,7 @@ def build_index(should_run_destructive=False):
def main():
args = parse_args()
- build_index(args.should_run_destructive)
+ build_index(args.should_run_destructive, args.invalidation_window)
def parse_args():
@@ -196,6 +205,14 @@ def parse_args():
default=False,
help="create a fresh index and perform a full Geminispace crawl",
)
+ parser.add_argument(
+ "--invalidation_window",
+ "-i",
+ dest="invalidation_window",
+ type=int,
+ default=0,
+ help="a recency window, in hours, for recently crawled pages that should be forcefully reindexed",
+ )
args = parser.parse_args()
return args
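
For illustration, a minimal standalone sketch of the query that invalidate_recent_results() performs, using a throwaway in-memory peewee model rather than the project's real gus.lib.db_model schema (the Page fields below are simplified stand-ins); in the code above, each matching page is then removed from the Whoosh index with index_writer.delete_by_term("url_id", page.url).

from datetime import datetime, timedelta

from peewee import DateTimeField, Model, SqliteDatabase, TextField

# Throwaway in-memory stand-in for gus.lib.db_model.Page (simplified fields).
db = SqliteDatabase(":memory:")

class Page(Model):
    url = TextField()
    indexed_at = DateTimeField(null=True)

    class Meta:
        database = db

db.connect()
db.create_tables([Page])
Page.create(url="gemini://example.org/old", indexed_at=datetime.now() - timedelta(hours=48))
Page.create(url="gemini://example.org/new", indexed_at=datetime.now() - timedelta(hours=1))

# The same window arithmetic as invalidate_recent_results(): pages indexed
# within the last N hours are selected so their index entries can be rebuilt.
invalidation_window = 24  # hours, as passed via -i / --invalidation_window
recency_minimum = datetime.now() - timedelta(hours=invalidation_window)
recent = Page.select().where(Page.indexed_at.is_null(False), Page.indexed_at > recency_minimum)
for page in recent:
    print("would reindex:", page.url)  # only .../new falls inside the window
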
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -8,6 +8,7 @@ import shutil
import time
from urllib.parse import urljoin, uses_relative, uses_netloc
+import feedparser
import gusmobile as gemini
from peewee import (
BooleanField,
@@ -21,6 +22,7 @@ from peewee import (
TextField,
)
+from . import constants
from gus.lib.db_model import init_db, Page, Link
from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
@@ -269,11 +271,16 @@ def get_robots_file(robot_host):
return robot_file_map[robot_host]
-def crawl(gemini_resource):
+def crawl(gemini_resource, current_depth):
+ gr = gemini_resource
+ if max_crawl_depth >= 0 and current_depth > max_crawl_depth:
+ print("DEPTH SKIP : %s" % gr.fetchable_url)
+ print("--------------------------")
+ return
if not gemini_resource.is_valid:
+ print("INVALID RSCRC: %s" % gr.fetchable_url)
+ print("--------------------------")
return
-
- gr = gemini_resource
for excluded_prefix in EXCLUDED_URL_PREFIXES:
if gr.normalized_url.startswith(excluded_prefix):
print("MANUAL EXCLUSION SKIP : %s" % gr.fetchable_url)
@@ -343,7 +350,7 @@ def crawl(gemini_resource):
# already seen this resource in visited_urls' normalized source of truth.
visited_urls.pop()
redirect_resource = GeminiResource(response.url, gr.normalized_url, gr.normalized_host)
- crawl(redirect_resource)
+ crawl(redirect_resource, current_depth)
elif response.status.startswith("1"):
# input status
print("URL : %s" % response.url)
@@ -358,15 +365,15 @@ def crawl(gemini_resource):
print("CONTENT TYPE : %s" % response.content_type)
if response.content_type.startswith("text/"):
index_content(gr, response)
if response.content_type == "text/gemini":
print("Extracting contained resources...")
print("--------------------------")
contained_resources = gr.extract_contained_resources(response.content)
index_links(gr, contained_resources)
for resource in contained_resources:
- crawl(resource)
+ crawl(resource, current_depth+1)
else:
print("--------------------------")
else:
index_binary(gr, response)
print("--------------------------")
@@ -404,6 +411,89 @@ def load_seed_request_urls():
return content
+def load_feed_urls(filename):
+ feeds = []
+ with open(filename, "r") as fp:
+ for line in fp:
+ line = line.strip()
+ if not line or line.startswith("#"):
+ continue
+ feeds.append(line)
+ return feeds
+
+
+def items_from_feed_string(feed_str):
+ feed_obj = feedparser.parse(feed_str)
+ feed = feed_obj.feed
+ return [(entry.updated_parsed, entry.link, entry.title, feed.title) for entry in feed_obj.entries]
+
+
+def resolve_feed_content_urls(feed_file=constants.FEED_FILE):
+ # Load feed URLs to query
+ feed_urls = load_feed_urls(feed_file)
+ N = len(feed_urls)
+
+ # Prepare to extract feed items
+ last_accessed = {}
+ skips = 0
+ items = []
+ while feed_urls:
+ # Get a feed URL to fetch
+ feed_url = feed_urls.pop()
+ feed_resource = GeminiResource(feed_url)
+
+ # Don't hammer servers
+ last = last_accessed.get(feed_resource.normalized_host, 0)
+ now = time.time()
+ interval = int(now - last)
+ if interval < 5:
+ print("Declining to hit {} again after only {} seconds".format(feed_resource.normalized_host, interval))
+ feed_urls.insert(0, feed_url)
+ skips += 1
+ if skips == len(feed_urls):
+ # We've hammered every server in the queue! Sleep a bit...
+ print("Sleeping to give all servers a rest!")
+ time.sleep(5)
+ continue
+ skips = 0
+
+ # Good to go
+ print("Fetching ", feed_url)
+ try:
+ resp = feed_resource.fetch()
+ except:
+ print("Error on {}, skipping...".format(feed_url))
+ continue
+ if resp and resp.status == "20":
+ last_accessed[feed_resource.normalized_host] = time.time()
+ items.extend(items_from_feed_string(resp.content))
+ return [item[1] for item in items]
+
+
+def recrawl_feeds():
+ content_urls = resolve_feed_content_urls()
+ global index_dir
+ index_dir = INDEX_DIR_CURRENT
+ global db
+ db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
+ global max_crawl_depth
+ max_crawl_depth = 0
+ global visited_urls
+ visited_urls = []
+ global robot_file_map
+ robot_file_map = unpickle_robot_file_map(INDEX_DIR_CURRENT)
+ global domain_hit_timings
+ domain_hit_timings = {}
+
+ seed_resources = [GeminiResource(url) for url in content_urls]
+ for resource in seed_resources:
+ crawl(resource, 0)
+
+ pickle_robot_file_map(robot_file_map, index_dir)
+ print(content_urls)
+ print("Finished!")
+
+
def run_crawl(should_run_destructive=False, seed_urls=[]):
# TODO: track failed domain/page attempts, and don't reattempt for 15mins
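
resolve_feed_content_urls() above mixes two concerns: parsing each fetched feed with feedparser to collect entry links, and rotating the queue so no host is hit more often than once every five seconds. The parsing half can be exercised in isolation, since feedparser accepts a raw string exactly like the response bodies handed to items_from_feed_string(); the Atom document below is canned sample data, not a real feed:

import feedparser

# Canned Atom feed standing in for a fetched gemini:// response body.
ATOM_SAMPLE = """<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <title>Example gemlog</title>
  <updated>2020-07-08T00:00:00Z</updated>
  <entry>
    <title>Hello Geminispace</title>
    <link href="gemini://example.org/posts/hello.gmi"/>
    <updated>2020-07-07T12:00:00Z</updated>
  </entry>
</feed>
"""

def items_from_feed_string(feed_str):
    # Same extraction as above: (updated, link, entry title, feed title) tuples.
    feed_obj = feedparser.parse(feed_str)
    feed = feed_obj.feed
    return [(entry.updated_parsed, entry.link, entry.title, feed.title)
            for entry in feed_obj.entries]

items = items_from_feed_string(ATOM_SAMPLE)
content_urls = [item[1] for item in items]  # what resolve_feed_content_urls() returns
print(content_urls)  # ['gemini://example.org/posts/hello.gmi']
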
@@ -411,7 +501,7 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
index_dir = INDEX_DIR_NEW if should_run_destructive else INDEX_DIR_CURRENT
pathlib.Path(index_dir).mkdir(parents=True, exist_ok=True)
global db
- db = init_db(index_dir + "/gus.sqlite")
+ db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
global visited_urls
visited_urls = [] if should_run_destructive else load_visited_urls(INDEX_DIR_CURRENT)
@@ -419,27 +509,31 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
robot_file_map = {} if should_run_destructive else unpickle_robot_file_map(INDEX_DIR_CURRENT)
global domain_hit_timings
domain_hit_timings = {}
+ global max_crawl_depth
+ max_crawl_depth = -1
seed_urls.extend(SEED_URLS)
seed_resources = [GeminiResource(url) for url in seed_urls]
for resource in seed_resources:
- crawl(resource)
+ crawl(resource, 0)
# after full crawl, crawl the seed requests as well in case there is
# anything new
seed_request_urls = load_seed_request_urls()
seed_request_resources = [GeminiResource(url) for url in seed_request_urls]
for resource in seed_request_resources:
- crawl(resource)
+ crawl(resource, 0)
pickle_robot_file_map(robot_file_map, index_dir)
-
print("Finished!")
def main():
args = parse_args()
- run_crawl(args.should_run_destructive, seed_urls=args.seed_urls)
+ if args.should_recrawl_feeds:
+ recrawl_feeds()
+ else:
+ run_crawl(args.should_run_destructive, seed_urls=args.seed_urls)
def parse_args():
@@ -453,6 +547,14 @@ def parse_args():
help="create a fresh index and perform a full Geminispace crawl",
)
parser.add_argument(
+ "--feeds",
+ "-f",
+ dest="should_recrawl_feeds",
+ action="store_true",
+ default=False,
+ help="recrawl known atom feeds",
+ )
+ parser.add_argument(
"--seeds",
"-s",
metavar="URL",
diff --git a/poetry.lock b/poetry.lock
@@ -93,6 +93,14 @@ version = "4.4.2"
[[package]]
category = "main"
+description = "Universal feed parser, handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds"
+name = "feedparser"
+optional = false
+python-versions = "*"
+version = "5.2.1"
+
+[[package]]
+category = "main"
description = "A simple library for requesting resources using the gemini protocol"
name = "gusmobile"
optional = false
@@ -317,7 +325,7 @@ wcwidth = "*"
[[package]]
category = "dev"
description = "Run a subprocess in a pseudo terminal"
-marker = "python_version >= \"3.4\" and sys_platform != \"win32\" or sys_platform != \"win32\""
+marker = "python_version >= \"3.4\" and sys_platform != \"win32\" or sys_platform != \"win32\" or python_version >= \"3.4\" and sys_platform != \"win32\" and (python_version >= \"3.4\" and sys_platform != \"win32\" or sys_platform != \"win32\")"
name = "ptyprocess"
optional = false
python-versions = "*"
@@ -451,7 +459,7 @@ docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"]
testing = ["jaraco.itertools", "func-timeout"]
[metadata]
-content-hash = "de8df694bd7d989863ed4249d3854b696f8f7702aef0c5280ca5d799d02512b9"
+content-hash = "aee51a99ddf80b23f5c568f5bde4fc004294011ac14b21868e0a9fde6b7c0319"
python-versions = "^3.7"
[metadata.files]
@@ -491,6 +499,11 @@ decorator = [
{file = "decorator-4.4.2-py2.py3-none-any.whl", hash = "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760"},
{file = "decorator-4.4.2.tar.gz", hash = "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"},
]
+feedparser = [
+ {file = "feedparser-5.2.1.tar.bz2", hash = "sha256:ce875495c90ebd74b179855449040003a1beb40cd13d5f037a0654251e260b02"},
+ {file = "feedparser-5.2.1.tar.gz", hash = "sha256:bd030652c2d08532c034c27fcd7c85868e7fa3cb2b17f230a44a6bbc92519bf9"},
+ {file = "feedparser-5.2.1.zip", hash = "sha256:cd2485472e41471632ed3029d44033ee420ad0b57111db95c240c9160a85831c"},
+]
gusmobile = []
importlib-metadata = [
{file = "importlib_metadata-1.6.1-py2.py3-none-any.whl", hash = "sha256:15ec6c0fd909e893e3a08b3a7c76ecb149122fb14b7efe1199ddd4c7c57ea958"},
diff --git a/pyproject.toml b/pyproject.toml
@@ -12,6 +12,7 @@ whoosh = "^2.7.4"
jetforce = "^0.2.0"
jinja2 = "^2.11.2"
peewee = "^3.13.3"
+feedparser = "^5.2.1"
[tool.poetry.dev-dependencies]
black = "^19.10b0"