geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit 153745eb823c0962489289d85171bbdc24310855
parent 7b5205c27c989e2f52eec57712de4a93de326db2
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Tue, 17 Nov 2020 08:32:02 -0500

Add regex-based url exclusion support & refactor tests

This commit adds support for excluding URLs by regex, which is more
powerful than the prefix- and suffix-based exclusions GUS has
supported so far. There have been a number of cases, primarily
involving wiki-type sites, where it is useful to match a pattern that
occurs in the middle of a URL; that is now possible. An example is
twinwiki's "_history" and "_revert" pages.
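
For illustration, here is a minimal standalone sketch of the idea. The
pattern mirrors the EXCLUDED_URL_PATTERN added to gus/crawl.py below,
and the sample URLs are taken from the new tests; the variable names
are only for this demo:

    import re

    # Same pattern as the EXCLUDED_URL_PATTERN introduced in gus/crawl.py.
    pattern = re.compile(
        r"^gemini://(\d{6}\.ch|almp\d{4}\.app|.*/_(revert|history)/).*",
        flags=re.IGNORECASE,
    )

    # "_revert" sits in the middle of the path, so neither a prefix nor a
    # suffix rule can target it, but the compiled regex matches it directly.
    print(bool(pattern.match(
        "gemini://hannuhartikainen.fi/twinwiki/_revert/1594367314474")))  # True
    print(bool(pattern.match(
        "gemini://hannuhartikainen.fi/twinwiki")))  # False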

This commit also refactors the existing test file from the previous
unittest style to a more native pytest style. Additionally, it adds a
new set of tests for the URL exclusion functionality, covering both
the new regex-based exclusions described above and the older
prefix- and suffix-based exclusions.
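
As a rough sketch of the resulting test style (assuming should_skip()
and GeminiResource are importable exactly as in the diff below; the
URLs and expected results are taken from the added
tests/gus/test_crawl.py):

    import pytest

    from gus.crawl import should_skip
    from gus.lib.gemini import GeminiResource


    @pytest.mark.parametrize("test_url,expected_result", [
        ("gemini://hannuhartikainen.fi/twinwiki/_revert/1594367314474", True),
        ("gemini://hannuhartikainen.fi/twinwiki", False),
    ])
    def test_excluded_url_pattern(test_url, expected_result):
        # Plain asserts replace unittest's self.assertEqual in pytest style.
        assert should_skip(GeminiResource(test_url)) == expected_result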

Diffstat:
M README.md                    |  2 +-
M gus/build_index.py           | 10 +++-------
M gus/crawl.py                 | 50 +++++++++++++++++++++++++++++---------------------
M tests/gus/lib/test_gemini.py | 24 +++++++++++-------------
A tests/gus/test_crawl.py      | 39 +++++++++++++++++++++++++++++++++++++++
5 files changed, 83 insertions(+), 42 deletions(-)

diff --git a/README.md b/README.md
@@ -34,7 +34,7 @@ Now you'll have created `index.new` directory, rename it to `index`.
 
 ## Running test suite
 
-Run: "poetry run python -m pytest"
+Run: "poetry run pytest"
 
 ## Roadmap / TODOs
 
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -7,6 +7,7 @@ from urllib.parse import uses_relative, uses_netloc
 from . import constants
 from gus.crawl import EXCLUDED_URL_PREFIXES
 from gus.lib.db_model import init_db, Page
+from gus.lib.gemini import GeminiResource
 from gus.lib.index_statistics import (
     compute_index_statistics,
     persist_statistics,
@@ -23,14 +24,9 @@ uses_netloc.append("gemini")
 
 
 def index_page(index, page, indexed_urls):
-    should_skip = False
-    for excluded_prefix in EXCLUDED_URL_PREFIXES:
-        if page.normalized_url.startswith(excluded_prefix):
-            should_skip = True
-            break
-    if should_skip:
+    if should_skip(GeminiResource(page.url)):
         logging.debug(
-            "URL prefix matches exclusion list, skipping: %s",
+            "URL is excluded, skipping: %s",
             strip_control_chars(page.url),
         )
         return False
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -1,5 +1,6 @@
 import argparse
 import logging
+import re
 from datetime import datetime, timedelta
 
 import os
@@ -21,6 +22,11 @@ import gus.lib.logging
 uses_relative.append("gemini")
 uses_netloc.append("gemini")
 
+EXCLUDED_URL_PATTERN = re.compile(
+    r"^gemini://(\d{6}\.ch|almp\d{4}\.app|.*/_(revert|history)/).*",
+    flags=re.IGNORECASE
+)
+
 # These are checked against normalized_url, so they should be
 # prepended with the gemini:// protocol, be all lowercased, and
 # not have the port specified if it is 1965.
@@ -331,17 +337,28 @@ def index_content(resource, response):
     return page, is_different
 
 
+def should_skip(resource):
+    should_skip = False
+    for excluded_prefix in EXCLUDED_URL_PREFIXES:
+        if resource.normalized_url.startswith(excluded_prefix):
+            should_skip = True
+            break
+    for excluded_path in EXCLUDED_URL_PATHS:
+        if resource.urlsplit.path.lower().endswith(excluded_path):
+            should_skip = True
+            break
+    m = EXCLUDED_URL_PATTERN.match(resource.normalized_url)
+    if m:
+        should_skip = True
+    return should_skip
+
+
 def index_links(from_resource, contained_resources):
     from_page, created = Page.get_or_create(url=from_resource.indexable_url)
     Link.delete().where(Link.from_page == from_page).execute()
     data = []
     for cr in contained_resources:
-        should_skip = False
-        for excluded_prefix in EXCLUDED_URL_PREFIXES:
-            if cr.normalized_url.startswith(excluded_prefix):
-                should_skip = True
-                break
-        if should_skip:
+        if should_skip(cr):
             continue
         to_page = Page.get_or_none(url=cr.indexable_url)
         if not to_page:
@@ -392,21 +409,12 @@ def crawl_page(
             gus.lib.logging.strip_control_chars(url),
         )
         return
-    for excluded_prefix in EXCLUDED_URL_PREFIXES:
-        if gr.normalized_url.startswith(excluded_prefix):
-            logging.info(
-                "URL prefix matches exclusion list, skipping: %s",
-                gus.lib.logging.strip_control_chars(url),
-            )
-            return
-    for excluded_path in EXCLUDED_URL_PATHS:
-        if gr.urlsplit.path.lower().endswith(excluded_path):
-            logging.info(
-                "URL on exclusion list, skipping: %s",
-                gus.lib.logging.strip_control_chars(url),
-            )
-            return
-
+    if should_skip(gr):
+        logging.info(
+            "URL is excluded, skipping: %s",
+            gus.lib.logging.strip_control_chars(url),
+        )
+        return
     if should_check_if_expired:
         existing_page = Page.get_or_none(url=gr.indexable_url)
         if existing_page and existing_page.change_frequency is not None:
diff --git a/tests/gus/lib/test_gemini.py b/tests/gus/lib/test_gemini.py
@@ -1,30 +1,28 @@
-import unittest
 from gus.lib.gemini import GeminiResource
 
-
-class TestGeminiResource(unittest.TestCase):
+class TestGeminiResource:
     def test_extract_contained_resources(self):
         url = "gemini://host"
         # no content
         resources = GeminiResource(url).extract_contained_resources("")
-        self.assertEqual(resources, [])
+        assert resources == []
 
         # not a link
         resources = GeminiResource(url).extract_contained_resources(" => link")
-        self.assertEqual(resources, [])
+        assert resources == []
 
         resources = GeminiResource(url).extract_contained_resources(
            "```\n=> preformatted\n```"
        )
-        self.assertEqual(resources, [])
+        assert resources == []
 
         # some links
         resources = GeminiResource(url).extract_contained_resources(
             "=> link\ntext\n=> other"
         )
-        self.assertEqual(len(resources), 2)
-        self.assertEqual(resources[0].raw_url, "link")
-        self.assertEqual(resources[1].raw_url, "other")
+        assert len(resources) == 2
+        assert resources[0].raw_url == "link"
+        assert resources[1].raw_url == "other"
 
         resources = GeminiResource(url).extract_contained_resources(
             """
@@ -36,8 +34,8 @@ text
 => no link
 ```
 => other
-            """
+            """
         )
-        self.assertEqual(len(resources), 2)
-        self.assertEqual(resources[0].raw_url, "link")
-        self.assertEqual(resources[1].raw_url, "other")
+        assert len(resources) == 2
+        assert resources[0].raw_url == "link"
+        assert resources[1].raw_url == "other"
diff --git a/tests/gus/test_crawl.py b/tests/gus/test_crawl.py
@@ -0,0 +1,39 @@
+import pytest
+
+from gus.crawl import should_skip
+from gus.lib.gemini import GeminiResource
+
+class TestUrlExclusion:
+    @pytest.mark.parametrize("test_url,expected_result", [
+        ("gemini://gemini.circumlunar.space/favicon.ico", True),
+        ("gemini://gemini.circumlunar.space/rss.txt", True),
+    ])
+    def test_excluded_url_paths(self, test_url, expected_result):
+        resource = GeminiResource(test_url)
+        assert should_skip(resource) == expected_result
+
+
+    @pytest.mark.parametrize("test_url,expected_result", [
+        ("gemini://hannuhartikainen.fi/twinwiki/_revert/1594367314474", True),
+        ("gemini://hannuhartikainen.fi/twinwiki/1594367314474", False),
+        ("gemini://hannuhartikainen.fi/twinwiki/Sandbox/_history/1594037613712", True),
+        ("gemini://hannuhartikainen.fi/twinwiki", False),
+        ("gemini://123456.ch", True),
+        ("gemini://123456.ch/fnord", True),
+        ("gemini://almp1234.app", True),
+        ("gemini://almp1234.app/fnord", True),
+    ])
+    def test_excluded_url_pattern(self, test_url, expected_result):
+        resource = GeminiResource(test_url)
+        assert should_skip(resource) == expected_result
+
+
+    @pytest.mark.parametrize("test_url,expected_result", [
+        ("gemini://localhost", True),
+        ("gemini://example.org", True),
+        ("gus.guru", False),
+        ("gus.guru/search?turkey", True),
+    ])
+    def test_excluded_url_prefixes(self, test_url, expected_result):
+        resource = GeminiResource(test_url)
+        assert should_skip(resource) == expected_result