geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit c74caeb975ca120edf75f8d0a6bb865a1c144952
parent e4b2ef0192c7d75583f6a417c260585566b5125f
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Wed,  8 Jul 2020 06:18:15 -0400

[crawl] Add feature to seed incremental crawl with atom feeds

Diffstat:
M gus/build_index.py | 23 +++++++++++++++++++++--
M gus/crawl.py       | 128 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
M poetry.lock        | 17 +++++++++++++++--
M pyproject.toml     | 1 +
4 files changed, 152 insertions(+), 17 deletions(-)
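
In practice, the new feed seeding reads a list of known Atom feed URLs from a feeds file (one URL per line, "#" comments allowed), fetches each feed over Gemini, and crawls the entry links with max_crawl_depth set to 0, so freshly published gemlog posts get reindexed between full crawls. The snippet below is a minimal standalone sketch of the extraction step, mirroring the items_from_feed_string() helper added in gus/crawl.py; the sample feed text and the print call are illustrative only and are not part of this commit.

# Sketch of the feed-entry extraction added in gus/crawl.py (illustrative only:
# GUS fetches each feed over Gemini via GeminiResource rather than from a local
# string, and the sample feed below is made up).
import feedparser


def items_from_feed_string(feed_str):
    # One tuple per entry: (updated, link, entry title, feed title).
    feed_obj = feedparser.parse(feed_str)
    feed = feed_obj.feed
    return [(entry.updated_parsed, entry.link, entry.title, feed.title)
            for entry in feed_obj.entries]


sample_feed = """<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <title>Example gemlog</title>
  <entry>
    <title>Hello Geminispace</title>
    <link href="gemini://example.org/posts/hello.gmi"/>
    <updated>2020-07-08T06:18:15Z</updated>
  </entry>
</feed>"""

# The crawler keeps only the link (item[1]) from each tuple and crawls it with
# max_crawl_depth = 0, so just the linked page itself gets (re)indexed.
seed_urls = [item[1] for item in items_from_feed_string(sample_feed)]
print(seed_urls)  # ['gemini://example.org/posts/hello.gmi']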

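On the index side, build_index.py gains an -i/--invalidation_window option (in hours): before reindexing, any page crawled inside that window is removed from the Whoosh index by its url_id term so that it gets indexed afresh. Below is a minimal standalone sketch of that delete-by-term mechanism, with an invented index directory, a simplified schema, and example URLs; in GUS the affected URLs come from the Page database table, not from a hard-coded list.

# Standalone sketch of the invalidation mechanism behind the new
# invalidate_recent_results() (illustrative: the index directory, the simplified
# schema and the URLs are made up; GUS keeps crawl timestamps in its Page table).
import os
from datetime import datetime, timedelta

from whoosh import index
from whoosh.fields import ID, TEXT, Schema

os.makedirs("example-index", exist_ok=True)
schema = Schema(url_id=ID(stored=True, unique=True), content=TEXT)
ix = index.create_in("example-index", schema)  # GUS reopens its existing index

writer = ix.writer()
writer.add_document(url_id="gemini://example.org/old.gmi", content="stale page")
writer.add_document(url_id="gemini://example.org/new.gmi", content="fresh page")
writer.commit()

# Stand-in for the Page table: each page's URL and when it was last crawled.
recently_indexed = [
    ("gemini://example.org/old.gmi", datetime.now() - timedelta(days=3)),
    ("gemini://example.org/new.gmi", datetime.now()),
]

# Anything crawled inside the window is deleted by its url_id term, the same call
# the commit adds (index_writer.delete_by_term("url_id", page.url)), so the next
# build_index() run indexes it from scratch.
invalidation_window = 48  # hours, i.e. the value passed via -i/--invalidation_window
recency_minimum = datetime.now() - timedelta(hours=invalidation_window)
writer = ix.writer()
for url, indexed_at in recently_indexed:
    if indexed_at > recency_minimum:
        writer.delete_by_term("url_id", url)
writer.commit()  # only the stale page remains indexed until the next rebuild

On the crawler side, the matching -f/--feeds flag makes main() call recrawl_feeds() instead of a full run_crawl(), so pages discovered through feeds can be refreshed without recrawling all of Geminispace.
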
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -151,7 +151,14 @@ def load_indexed_urls(index_dir):
     return indexed_urls


-def build_index(should_run_destructive=False):
+def invalidate_recent_results(invalidation_window):
+    recency_minimum = datetime.now() - timedelta(hours=invalidation_window)
+    pages = Page.select().where(Page.indexed_at.is_null(False), Page.indexed_at > recency_minimum)
+    for page in pages:
+        index_writer.delete_by_term("url_id", page.url, searcher=None)
+
+
+def build_index(should_run_destructive=False, invalidation_window=0):
     global index_dir
     index_dir = INDEX_DIR_NEW if should_run_destructive else INDEX_DIR_CURRENT
     global index_storage
@@ -165,6 +172,8 @@ def build_index(should_run_destructive=False):
     ix = index_storage.open_index()
     global index_writer
     index_writer = ix.writer()
+
+    invalidate_recent_results(invalidation_window)

     indexed_urls = [] if should_run_destructive else load_indexed_urls(INDEX_DIR_CURRENT)
     pages = Page.select().where(Page.indexed_at.is_null(False))
@@ -183,7 +192,7 @@ def build_index(should_run_destructive=False):
 def main():
     args = parse_args()

-    build_index(args.should_run_destructive)
+    build_index(args.should_run_destructive, args.invalidation_window)


 def parse_args():
@@ -196,6 +205,16 @@ def parse_args():
         default=False,
         help="create a fresh index and perform a full Geminispace crawl",
     )
+    parser.add_argument(
+        "--invalidation_window",
+        "-i",
+        dest="invalidation_window",
+        type=int,
+        default=0,
+        help="a recency window, in hours, for recently crawled pages that should be forcefully reindexed",
+    )
+    parser.add_argument('-o', '--output', dest='output', type=str,
+                        default="index.gmi", help='output filename')
     args = parser.parse_args()
     return args

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -8,6 +8,7 @@ import shutil
 import time
 from urllib.parse import urljoin, uses_relative, uses_netloc

+import feedparser
 import gusmobile as gemini
 from peewee import (
     BooleanField,
@@ -21,6 +22,7 @@ from peewee import (
     TextField,
 )

+from . import constants
 from gus.lib.db_model import init_db, Page, Link
 from gus.lib.gemini import GeminiResource, GeminiRobotFileParser

@@ -269,11 +271,16 @@ def get_robots_file(robot_host):
     return robot_file_map[robot_host]


-def crawl(gemini_resource):
+def crawl(gemini_resource, current_depth):
+    gr = gemini_resource
+    if max_crawl_depth >= 0 and current_depth > max_crawl_depth:
+        print("DEPTH SKIP : %s" % gr.fetchable_url)
+        print("--------------------------")
+        return
     if not gemini_resource.is_valid:
+        print("INVALID RSCRC: %s" % gr.fetchable_url)
+        print("--------------------------")
         return
-
-    gr = gemini_resource
     for excluded_prefix in EXCLUDED_URL_PREFIXES:
         if gr.normalized_url.startswith(excluded_prefix):
             print("MANUAL EXCLUSION SKIP : %s" % gr.fetchable_url)
@@ -343,7 +350,7 @@ def crawl(gemini_resource):
         # already seen this resource in visited_urls' normalized source of truth.
         visited_urls.pop()
         redirect_resource = GeminiResource(response.url, gr.normalized_url, gr.normalized_host)
-        crawl(redirect_resource)
+        crawl(redirect_resource, current_depth)
     elif response.status.startswith("1"):
         # input status
         print("URL : %s" % response.url)
@@ -358,15 +365,15 @@ def crawl(gemini_resource):
         print("CONTENT TYPE : %s" % response.content_type)
         if response.content_type.startswith("text/"):
             index_content(gr, response)
-            if response.content_type == "text/gemini":
+            if response.content_type != "text/gemini":
+                print("--------------------------")
+            else:
                 print("Extracting contained resources...")
                 print("--------------------------")
                 contained_resources = gr.extract_contained_resources(response.content)
                 index_links(gr, contained_resources)
                 for resource in contained_resources:
-                    crawl(resource)
-            else:
-                print("--------------------------")
+                    crawl(resource, current_depth+1)
         else:
             index_binary(gr, response)
             print("--------------------------")
@@ -404,6 +411,89 @@ def load_seed_request_urls():
     return content


+def load_feed_urls(filename):
+    feeds = []
+    with open(filename, "r") as fp:
+        for line in fp:
+            line = line.strip()
+            if not line or line.startswith("#"):
+                continue
+            feeds.append(line)
+    return feeds
+
+
+def items_from_feed_string(feed_str):
+    feed_obj = feedparser.parse(feed_str)
+    feed = feed_obj.feed
+    return [(entry.updated_parsed, entry.link, entry.title, feed.title) for entry in feed_obj.entries]
+
+
+def resolve_feed_content_urls(feed_file=constants.FEED_FILE):
+    # Load feed URLs to query
+    feed_urls = load_feed_urls(feed_file)
+    N = len(feed_urls)
+
+    # Prepare to extract feed items
+    last_accessed = {}
+    skips = 0
+    items = []
+    while feed_urls:
+        # Get a feed URL to fetch
+        feed_url = feed_urls.pop()
+        feed_resource = GeminiResource(feed_url)
+
+        # Don't hammer servers
+        last = last_accessed.get(feed_resource.normalized_host, 0)
+        now = time.time()
+        interval = int(now - last)
+        if interval < 5:
+            print("Declining to hit {} again after only {} seconds".format(feed_resource.normalized_host, interval))
+            feed_urls.insert(0, feed_url)
+            skips += 1
+            if skips == len(feed_urls):
+                # We've hammered every server in the queue! Sleep a bit...
+                print("Sleeping to give all servers a rest!")
+                time.sleep(5)
+            continue
+        skips = 0
+
+        # Good to go
+        print("Fetching ", feed_url)
+        try:
+            resp = feed_resource.fetch()
+        except:
+            print("Error on {}, skipping...".format(feed_url))
+            continue
+        if resp and resp.status == "20":
+            last_accessed[feed_resource.normalized_host] = time.time()
+            items.extend(items_from_feed_string(resp.content))
+    return [item[1] for item in items]
+
+
+def recrawl_feeds():
+    content_urls = resolve_feed_content_urls()
+    global index_dir
+    index_dir = INDEX_DIR_CURRENT
+    global db
+    db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
+    global max_crawl_depth
+    max_crawl_depth = 0
+    global visited_urls
+    visited_urls = []
+    global robot_file_map
+    robot_file_map = unpickle_robot_file_map(INDEX_DIR_CURRENT)
+    global domain_hit_timings
+    domain_hit_timings = {}
+
+    seed_resources = [GeminiResource(url) for url in content_urls]
+    for resource in seed_resources:
+        crawl(resource, 0)
+
+    pickle_robot_file_map(robot_file_map, index_dir)
+    print(content_urls)
+    print("Finished!")
+
+
 def run_crawl(should_run_destructive=False, seed_urls=[]):
     # TODO: track failed domain/page attempts, and don't reattempt for 15mins

@@ -411,7 +501,7 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     index_dir = INDEX_DIR_NEW if should_run_destructive else INDEX_DIR_CURRENT
     pathlib.Path(index_dir).mkdir(parents=True, exist_ok=True)
     global db
-    db = init_db(index_dir + "/gus.sqlite")
+    db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
     global visited_urls
     visited_urls = [] if should_run_destructive else load_visited_urls(INDEX_DIR_CURRENT)
@@ -419,27 +509,31 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     robot_file_map = {} if should_run_destructive else unpickle_robot_file_map(INDEX_DIR_CURRENT)
     global domain_hit_timings
     domain_hit_timings = {}
+    global max_crawl_depth
+    max_crawl_depth = -1

     seed_urls.extend(SEED_URLS)
     seed_resources = [GeminiResource(url) for url in seed_urls]
     for resource in seed_resources:
-        crawl(resource)
+        crawl(resource, 0)

     # after full crawl, crawl the seed requests as well in case there is
     # anything new
     seed_request_urls = load_seed_request_urls()
     seed_request_resources = [GeminiResource(url) for url in seed_request_urls]
     for resource in seed_request_resources:
-        crawl(resource)
+        crawl(resource, 0)

     pickle_robot_file_map(robot_file_map, index_dir)
-    print("Finished!")


 def main():
     args = parse_args()

-    run_crawl(args.should_run_destructive, seed_urls=args.seed_urls)
+    if args.should_recrawl_feeds:
+        recrawl_feeds()
+    else:
+        run_crawl(args.should_run_destructive, seed_urls=args.seed_urls)


 def parse_args():
@@ -453,6 +547,14 @@ def parse_args():
         help="create a fresh index and perform a full Geminispace crawl",
     )
     parser.add_argument(
+        "--feeds",
+        "-f",
+        dest="should_recrawl_feeds",
+        action="store_true",
+        default=False,
+        help="recrawl known atom feeds",
+    )
+    parser.add_argument(
         "--seeds",
         "-s",
         metavar="URL",
diff --git a/poetry.lock b/poetry.lock
@@ -93,6 +93,14 @@ version = "4.4.2"

 [[package]]
 category = "main"
+description = "Universal feed parser, handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds"
+name = "feedparser"
+optional = false
+python-versions = "*"
+version = "5.2.1"
+
+[[package]]
+category = "main"
 description = "A simple library for requesting resources using the gemini protocol"
 name = "gusmobile"
 optional = false
@@ -317,7 +325,7 @@ wcwidth = "*"
 [[package]]
 category = "dev"
 description = "Run a subprocess in a pseudo terminal"
-marker = "python_version >= \"3.4\" and sys_platform != \"win32\" or sys_platform != \"win32\""
+marker = "python_version >= \"3.4\" and sys_platform != \"win32\" or sys_platform != \"win32\" or python_version >= \"3.4\" and sys_platform != \"win32\" and (python_version >= \"3.4\" and sys_platform != \"win32\" or sys_platform != \"win32\")"
 name = "ptyprocess"
 optional = false
 python-versions = "*"
@@ -451,7 +459,7 @@ docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"]
 testing = ["jaraco.itertools", "func-timeout"]

 [metadata]
-content-hash = "de8df694bd7d989863ed4249d3854b696f8f7702aef0c5280ca5d799d02512b9"
+content-hash = "aee51a99ddf80b23f5c568f5bde4fc004294011ac14b21868e0a9fde6b7c0319"
 python-versions = "^3.7"

 [metadata.files]
@@ -491,6 +499,11 @@ decorator = [
     {file = "decorator-4.4.2-py2.py3-none-any.whl", hash = "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760"},
     {file = "decorator-4.4.2.tar.gz", hash = "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"},
 ]
+feedparser = [
+    {file = "feedparser-5.2.1.tar.bz2", hash = "sha256:ce875495c90ebd74b179855449040003a1beb40cd13d5f037a0654251e260b02"},
+    {file = "feedparser-5.2.1.tar.gz", hash = "sha256:bd030652c2d08532c034c27fcd7c85868e7fa3cb2b17f230a44a6bbc92519bf9"},
+    {file = "feedparser-5.2.1.zip", hash = "sha256:cd2485472e41471632ed3029d44033ee420ad0b57111db95c240c9160a85831c"},
+]
 gusmobile = []
 importlib-metadata = [
     {file = "importlib_metadata-1.6.1-py2.py3-none-any.whl", hash = "sha256:15ec6c0fd909e893e3a08b3a7c76ecb149122fb14b7efe1199ddd4c7c57ea958"},
diff --git a/pyproject.toml b/pyproject.toml
@@ -12,6 +12,7 @@ whoosh = "^2.7.4"
 jetforce = "^0.2.0"
 jinja2 = "^2.11.2"
 peewee = "^3.13.3"
+feedparser = "^5.2.1"

 [tool.poetry.dev-dependencies]
 black = "^19.10b0"