geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit c74caeb975ca120edf75f8d0a6bb865a1c144952
parent e4b2ef0192c7d75583f6a417c260585566b5125f
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Wed,  8 Jul 2020 06:18:15 -0400

[crawl] Add feature to seed incremental crawl with atom feeds

Diffstat:
M gus/build_index.py | 23 +++++++++++++++++++++--
M gus/crawl.py       | 128 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
M poetry.lock        | 17 +++++++++++++++--
M pyproject.toml     | 1 +
4 files changed, 152 insertions(+), 17 deletions(-)
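
In practice, the new feed seeding reads a list of known Atom feed URLs from a feeds file (one URL per line, "#" comments allowed), fetches each feed over Gemini, and crawls the entry links with max_crawl_depth set to 0, so freshly published gemlog posts get reindexed between full crawls. The snippet below is a minimal standalone sketch of the extraction step, mirroring the items_from_feed_string() helper added in gus/crawl.py; the sample feed text and the print call are illustrative only and are not part of this commit.

# Sketch of the feed-entry extraction added in gus/crawl.py (illustrative only:
# GUS fetches each feed over Gemini via GeminiResource rather than from a local
# string, and the sample feed below is made up).
import feedparser


def items_from_feed_string(feed_str):
    # One tuple per entry: (updated, link, entry title, feed title).
    feed_obj = feedparser.parse(feed_str)
    feed = feed_obj.feed
    return [(entry.updated_parsed, entry.link, entry.title, feed.title)
            for entry in feed_obj.entries]


sample_feed = """<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <title>Example gemlog</title>
  <entry>
    <title>Hello Geminispace</title>
    <link href="gemini://example.org/posts/hello.gmi"/>
    <updated>2020-07-08T06:18:15Z</updated>
  </entry>
</feed>"""

# The crawler keeps only the link (item[1]) from each tuple and crawls it with
# max_crawl_depth = 0, so just the linked page itself gets (re)indexed.
seed_urls = [item[1] for item in items_from_feed_string(sample_feed)]
print(seed_urls)  # ['gemini://example.org/posts/hello.gmi']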

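On the index side, build_index.py gains an -i/--invalidation_window option (in hours): before reindexing, any page crawled inside that window is removed from the Whoosh index by its url_id term so that it gets indexed afresh. Below is a minimal standalone sketch of that delete-by-term mechanism, with an invented index directory, a simplified schema, and example URLs; in GUS the affected URLs come from the Page database table, not from a hard-coded list.

# Standalone sketch of the invalidation mechanism behind the new
# invalidate_recent_results() (illustrative: the index directory, the simplified
# schema and the URLs are made up; GUS keeps crawl timestamps in its Page table).
import os
from datetime import datetime, timedelta

from whoosh import index
from whoosh.fields import ID, TEXT, Schema

os.makedirs("example-index", exist_ok=True)
schema = Schema(url_id=ID(stored=True, unique=True), content=TEXT)
ix = index.create_in("example-index", schema)  # GUS reopens its existing index

writer = ix.writer()
writer.add_document(url_id="gemini://example.org/old.gmi", content="stale page")
writer.add_document(url_id="gemini://example.org/new.gmi", content="fresh page")
writer.commit()

# Stand-in for the Page table: each page's URL and when it was last crawled.
recently_indexed = [
    ("gemini://example.org/old.gmi", datetime.now() - timedelta(days=3)),
    ("gemini://example.org/new.gmi", datetime.now()),
]

# Anything crawled inside the window is deleted by its url_id term, the same call
# the commit adds (index_writer.delete_by_term("url_id", page.url)), so the next
# build_index() run indexes it from scratch.
invalidation_window = 48  # hours, i.e. the value passed via -i/--invalidation_window
recency_minimum = datetime.now() - timedelta(hours=invalidation_window)
writer = ix.writer()
for url, indexed_at in recently_indexed:
    if indexed_at > recency_minimum:
        writer.delete_by_term("url_id", url)
writer.commit()  # only the stale page remains indexed until the next rebuild

On the crawler side, the matching -f/--feeds flag makes main() call recrawl_feeds() instead of a full run_crawl(), so pages discovered through feeds can be refreshed without recrawling all of Geminispace.
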
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -151,7 +151,14 @@ def load_indexed_urls(index_dir):
     return indexed_urls


-def build_index(should_run_destructive=False):
+def invalidate_recent_results(invalidation_window):
+    recency_minimum = datetime.now() - timedelta(hours=invalidation_window)
+    pages = Page.select().where(Page.indexed_at.is_null(False), Page.indexed_at > recency_minimum)
+    for page in pages:
+        index_writer.delete_by_term("url_id", page.url, searcher=None)
+
+
+def build_index(should_run_destructive=False, invalidation_window=0):
     global index_dir
     index_dir = INDEX_DIR_NEW if should_run_destructive else INDEX_DIR_CURRENT
     global index_storage
@@ -165,6 +172,8 @@ def build_index(should_run_destructive=False):
     ix = index_storage.open_index()
     global index_writer
     index_writer = ix.writer()
+
+    invalidate_recent_results(invalidation_window)

     indexed_urls = [] if should_run_destructive else load_indexed_urls(INDEX_DIR_CURRENT)
     pages = Page.select().where(Page.indexed_at.is_null(False))
@@ -183,7 +192,7 @@ def build_index(should_run_destructive=False):
 def main():
     args = parse_args()

-    build_index(args.should_run_destructive)
+    build_index(args.should_run_destructive, args.invalidation_window)


 def parse_args():
@@ -196,6 +205,16 @@ def parse_args():
         default=False,
         help="create a fresh index and perform a full Geminispace crawl",
     )
+    parser.add_argument(
+        "--invalidation_window",
+        "-i",
+        dest="invalidation_window",
+        type=int,
+        default=0,
+        help="a recency window, in hours, for recently crawled pages that should be forcefully reindexed",
+    )
+    parser.add_argument('-o', '--output', dest='output', type=str,
+                        default="index.gmi", help='output filename')
     args = parser.parse_args()
     return args

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -8,6 +8,7 @@ import shutil
 import time
 from urllib.parse import urljoin, uses_relative, uses_netloc

+import feedparser
 import gusmobile as gemini
 from peewee import (
     BooleanField,
@@ -21,6 +22,7 @@ from peewee import (
     TextField,
 )

+from . import constants
 from gus.lib.db_model import init_db, Page, Link
 from gus.lib.gemini import GeminiResource, GeminiRobotFileParser

@@ -269,11 +271,16 @@ def get_robots_file(robot_host):
     return robot_file_map[robot_host]


-def crawl(gemini_resource):
+def crawl(gemini_resource, current_depth):
+    gr = gemini_resource
+    if max_crawl_depth >= 0 and current_depth > max_crawl_depth:
+        print("DEPTH SKIP : %s" % gr.fetchable_url)
+        print("--------------------------")
+        return
     if not gemini_resource.is_valid:
+        print("INVALID RSCRC: %s" % gr.fetchable_url)
+        print("--------------------------")
         return
-
-    gr = gemini_resource
     for excluded_prefix in EXCLUDED_URL_PREFIXES:
         if gr.normalized_url.startswith(excluded_prefix):
             print("MANUAL EXCLUSION SKIP : %s" % gr.fetchable_url)
@@ -343,7 +350,7 @@ def crawl(gemini_resource):
         # already seen this resource in visited_urls' normalized source of truth.
         visited_urls.pop()
         redirect_resource = GeminiResource(response.url, gr.normalized_url, gr.normalized_host)
-        crawl(redirect_resource)
+        crawl(redirect_resource, current_depth)
     elif response.status.startswith("1"):
         # input status
         print("URL : %s" % response.url)
@@ -358,15 +365,15 @@ def crawl(gemini_resource):
         print("CONTENT TYPE : %s" % response.content_type)
         if response.content_type.startswith("text/"):
             index_content(gr, response)
-            if response.content_type == "text/gemini":
+            if response.content_type != "text/gemini":
+                print("--------------------------")
+            else:
                 print("Extracting contained resources...")
                 print("--------------------------")
                 contained_resources = gr.extract_contained_resources(response.content)
                 index_links(gr, contained_resources)
                 for resource in contained_resources:
-                    crawl(resource)
-            else:
-                print("--------------------------")
+                    crawl(resource, current_depth+1)
         else:
             index_binary(gr, response)
             print("--------------------------")
@@ -404,6 +411,89 @@ def load_seed_request_urls():
     return content


+def load_feed_urls(filename):
+    feeds = []
+    with open(filename, "r") as fp:
+        for line in fp:
+            line = line.strip()
+            if not line or line.startswith("#"):
+                continue
+            feeds.append(line)
+    return feeds
+
+
+def items_from_feed_string(feed_str):
+    feed_obj = feedparser.parse(feed_str)
+    feed = feed_obj.feed
+    return [(entry.updated_parsed, entry.link, entry.title, feed.title) for entry in feed_obj.entries]
+
+
+def resolve_feed_content_urls(feed_file=constants.FEED_FILE):
+    # Load feed URLs to query
+    feed_urls = load_feed_urls(feed_file)
+    N = len(feed_urls)
+
+    # Prepare to extract feed items
+    last_accessed = {}
+    skips = 0
+    items = []
+    while feed_urls:
+        # Get a feed URL to fetch
+        feed_url = feed_urls.pop()
+        feed_resource = GeminiResource(feed_url)
+
+        # Don't hammer servers
+        last = last_accessed.get(feed_resource.normalized_host, 0)
+        now = time.time()
+        interval = int(now - last)
+        if interval < 5:
+            print("Declining to hit {} again after only {} seconds".format(feed_resource.normalized_host, interval))
+            feed_urls.insert(0, feed_url)
+            skips += 1
+            if skips == len(feed_urls):
+                # We've hammered every server in the queue! Sleep a bit...
+                print("Sleeping to give all servers a rest!")
+                time.sleep(5)
+            continue
+        skips = 0
+
+        # Good to go
+        print("Fetching ", feed_url)
+        try:
+            resp = feed_resource.fetch()
+        except:
+            print("Error on {}, skipping...".format(feed_url))
+            continue
+        if resp and resp.status == "20":
+            last_accessed[feed_resource.normalized_host] = time.time()
+            items.extend(items_from_feed_string(resp.content))
+    return [item[1] for item in items]
+
+
+def recrawl_feeds():
+    content_urls = resolve_feed_content_urls()
+    global index_dir
+    index_dir = INDEX_DIR_CURRENT
+    global db
+    db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
+    global max_crawl_depth
+    max_crawl_depth = 0
+    global visited_urls
+    visited_urls = []
+    global robot_file_map
+    robot_file_map = unpickle_robot_file_map(INDEX_DIR_CURRENT)
+    global domain_hit_timings
+    domain_hit_timings = {}
+
+    seed_resources = [GeminiResource(url) for url in content_urls]
+    for resource in seed_resources:
+        crawl(resource, 0)
+
+    pickle_robot_file_map(robot_file_map, index_dir)
+    print(content_urls)
+    print("Finished!")
+
+
 def run_crawl(should_run_destructive=False, seed_urls=[]):
     # TODO: track failed domain/page attempts, and don't reattempt for 15mins

@@ -411,7 +501,7 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     index_dir = INDEX_DIR_NEW if should_run_destructive else INDEX_DIR_CURRENT
     pathlib.Path(index_dir).mkdir(parents=True, exist_ok=True)
     global db
-    db = init_db(index_dir + "/gus.sqlite")
+    db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
     global visited_urls
     visited_urls = [] if should_run_destructive else load_visited_urls(INDEX_DIR_CURRENT)
@@ -419,27 +509,31 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     robot_file_map = {} if should_run_destructive else unpickle_robot_file_map(INDEX_DIR_CURRENT)
     global domain_hit_timings
     domain_hit_timings = {}
+    global max_crawl_depth
+    max_crawl_depth = -1

     seed_urls.extend(SEED_URLS)
     seed_resources = [GeminiResource(url) for url in seed_urls]
     for resource in seed_resources:
-        crawl(resource)
+        crawl(resource, 0)

     # after full crawl, crawl the seed requests as well in case there is
     # anything new
     seed_request_urls = load_seed_request_urls()
     seed_request_resources = [GeminiResource(url) for url in seed_request_urls]
     for resource in seed_request_resources:
-        crawl(resource)
+        crawl(resource, 0)

     pickle_robot_file_map(robot_file_map, index_dir)
-    print("Finished!")


 def main():
     args = parse_args()

-    run_crawl(args.should_run_destructive, seed_urls=args.seed_urls)
+    if args.should_recrawl_feeds:
+        recrawl_feeds()
+    else:
+        run_crawl(args.should_run_destructive, seed_urls=args.seed_urls)


 def parse_args():
@@ -453,6 +547,14 @@ def parse_args():
         help="create a fresh index and perform a full Geminispace crawl",
     )
     parser.add_argument(
+        "--feeds",
+        "-f",
+        dest="should_recrawl_feeds",
+        action="store_true",
+        default=False,
+        help="recrawl known atom feeds",
+    )
+    parser.add_argument(
         "--seeds",
         "-s",
         metavar="URL",
diff --git a/poetry.lock b/poetry.lock
@@ -93,6 +93,14 @@ version = "4.4.2"

 [[package]]
 category = "main"
+description = "Universal feed parser, handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds"
+name = "feedparser"
+optional = false
+python-versions = "*"
+version = "5.2.1"
+
+[[package]]
+category = "main"
 description = "A simple library for requesting resources using the gemini protocol"
 name = "gusmobile"
 optional = false
@@ -317,7 +325,7 @@ wcwidth = "*"
 [[package]]
 category = "dev"
 description = "Run a subprocess in a pseudo terminal"
-marker = "python_version >= \"3.4\" and sys_platform != \"win32\" or sys_platform != \"win32\""
+marker = "python_version >= \"3.4\" and sys_platform != \"win32\" or sys_platform != \"win32\" or python_version >= \"3.4\" and sys_platform != \"win32\" and (python_version >= \"3.4\" and sys_platform != \"win32\" or sys_platform != \"win32\")"
 name = "ptyprocess"
 optional = false
 python-versions = "*"
@@ -451,7 +459,7 @@ docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"]
 testing = ["jaraco.itertools", "func-timeout"]

 [metadata]
-content-hash = "de8df694bd7d989863ed4249d3854b696f8f7702aef0c5280ca5d799d02512b9"
+content-hash = "aee51a99ddf80b23f5c568f5bde4fc004294011ac14b21868e0a9fde6b7c0319"
 python-versions = "^3.7"

 [metadata.files]
@@ -491,6 +499,11 @@ decorator = [
     {file = "decorator-4.4.2-py2.py3-none-any.whl", hash = "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760"},
     {file = "decorator-4.4.2.tar.gz", hash = "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"},
 ]
+feedparser = [
+    {file = "feedparser-5.2.1.tar.bz2", hash = "sha256:ce875495c90ebd74b179855449040003a1beb40cd13d5f037a0654251e260b02"},
+    {file = "feedparser-5.2.1.tar.gz", hash = "sha256:bd030652c2d08532c034c27fcd7c85868e7fa3cb2b17f230a44a6bbc92519bf9"},
+    {file = "feedparser-5.2.1.zip", hash = "sha256:cd2485472e41471632ed3029d44033ee420ad0b57111db95c240c9160a85831c"},
+]
 gusmobile = []
 importlib-metadata = [
     {file = "importlib_metadata-1.6.1-py2.py3-none-any.whl", hash = "sha256:15ec6c0fd909e893e3a08b3a7c76ecb149122fb14b7efe1199ddd4c7c57ea958"},
diff --git a/pyproject.toml b/pyproject.toml
@@ -12,6 +12,7 @@ whoosh = "^2.7.4"
 jetforce = "^0.2.0"
 jinja2 = "^2.11.2"
 peewee = "^3.13.3"
+feedparser = "^5.2.1"

 [tool.poetry.dev-dependencies]
 black = "^19.10b0"