geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit 153745eb823c0962489289d85171bbdc24310855
parent 7b5205c27c989e2f52eec57712de4a93de326db2
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Tue, 17 Nov 2020 08:32:02 -0500

Add regex-based url exclusion support & refactor tests

This commit adds support for excluding URLs by regex, which is more
powerful than the prefix- and suffix-based exclusions GUS has
supported so far. There have been a number of cases, primarily
involving wiki-type sites, where it is useful to match a pattern that
occurs in the middle of a URL; that is now possible. An example is
twinwiki's "_history" and "_revert" pages.
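
For illustration, here is a minimal standalone sketch of the idea. The
pattern mirrors the EXCLUDED_URL_PATTERN added to gus/crawl.py below,
and the sample URLs are taken from the new tests; the variable names
are only for this demo:

    import re

    # Same pattern as the EXCLUDED_URL_PATTERN introduced in gus/crawl.py.
    pattern = re.compile(
        r"^gemini://(\d{6}\.ch|almp\d{4}\.app|.*/_(revert|history)/).*",
        flags=re.IGNORECASE,
    )

    # "_revert" sits in the middle of the path, so neither a prefix nor a
    # suffix rule can target it, but the compiled regex matches it directly.
    print(bool(pattern.match(
        "gemini://hannuhartikainen.fi/twinwiki/_revert/1594367314474")))  # True
    print(bool(pattern.match(
        "gemini://hannuhartikainen.fi/twinwiki")))  # False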

This commit also refactors the existing test file from the previous
unittest style to a more native pytest style. Additionally, it adds a
new set of tests for the URL exclusion functionality, covering both
the new regex-based exclusions described above and the older
prefix- and suffix-based exclusions.
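
As a rough sketch of the resulting test style (assuming should_skip()
and GeminiResource are importable exactly as in the diff below; the
URLs and expected results are taken from the added
tests/gus/test_crawl.py):

    import pytest

    from gus.crawl import should_skip
    from gus.lib.gemini import GeminiResource


    @pytest.mark.parametrize("test_url,expected_result", [
        ("gemini://hannuhartikainen.fi/twinwiki/_revert/1594367314474", True),
        ("gemini://hannuhartikainen.fi/twinwiki", False),
    ])
    def test_excluded_url_pattern(test_url, expected_result):
        # Plain asserts replace unittest's self.assertEqual in pytest style.
        assert should_skip(GeminiResource(test_url)) == expected_result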

Diffstat:
M README.md                    |  2 +-
M gus/build_index.py           | 10 +++-------
M gus/crawl.py                 | 50 +++++++++++++++++++++++++++++---------------------
M tests/gus/lib/test_gemini.py | 24 +++++++++++-------------
A tests/gus/test_crawl.py      | 39 +++++++++++++++++++++++++++++++++++++++
5 files changed, 83 insertions(+), 42 deletions(-)

diff --git a/README.md b/README.md
@@ -34,7 +34,7 @@ Now you'll have created `index.new` directory, rename it to `index`.
 
 ## Running test suite
 
-Run: "poetry run python -m pytest"
+Run: "poetry run pytest"
 
 ## Roadmap / TODOs
 
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -7,6 +7,7 @@ from urllib.parse import uses_relative, uses_netloc
 from . import constants
 from gus.crawl import EXCLUDED_URL_PREFIXES
 from gus.lib.db_model import init_db, Page
+from gus.lib.gemini import GeminiResource
 from gus.lib.index_statistics import (
     compute_index_statistics,
     persist_statistics,
@@ -23,14 +24,9 @@ uses_netloc.append("gemini")
 
 
 def index_page(index, page, indexed_urls):
-    should_skip = False
-    for excluded_prefix in EXCLUDED_URL_PREFIXES:
-        if page.normalized_url.startswith(excluded_prefix):
-            should_skip = True
-            break
-    if should_skip:
+    if should_skip(GeminiResource(page.url)):
         logging.debug(
-            "URL prefix matches exclusion list, skipping: %s",
+            "URL is excluded, skipping: %s",
             strip_control_chars(page.url),
         )
         return False
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -1,5 +1,6 @@
 import argparse
 import logging
+import re
 from datetime import datetime, timedelta
 
 import os
@@ -21,6 +22,11 @@ import gus.lib.logging
 uses_relative.append("gemini")
 uses_netloc.append("gemini")
 
+EXCLUDED_URL_PATTERN = re.compile(
+    r"^gemini://(\d{6}\.ch|almp\d{4}\.app|.*/_(revert|history)/).*",
+    flags=re.IGNORECASE
+)
+
 # These are checked against normalized_url, so they should be
 # prepended with the gemini:// protocol, be all lowercased, and
 # not have the port specified if it is 1965.
@@ -331,17 +337,28 @@ def index_content(resource, response):
     return page, is_different
 
 
+def should_skip(resource):
+    should_skip = False
+    for excluded_prefix in EXCLUDED_URL_PREFIXES:
+        if resource.normalized_url.startswith(excluded_prefix):
+            should_skip = True
+            break
+    for excluded_path in EXCLUDED_URL_PATHS:
+        if resource.urlsplit.path.lower().endswith(excluded_path):
+            should_skip = True
+            break
+    m = EXCLUDED_URL_PATTERN.match(resource.normalized_url)
+    if m:
+        should_skip = True
+    return should_skip
+
+
 def index_links(from_resource, contained_resources):
     from_page, created = Page.get_or_create(url=from_resource.indexable_url)
     Link.delete().where(Link.from_page == from_page).execute()
     data = []
     for cr in contained_resources:
-        should_skip = False
-        for excluded_prefix in EXCLUDED_URL_PREFIXES:
-            if cr.normalized_url.startswith(excluded_prefix):
-                should_skip = True
-                break
-        if should_skip:
+        if should_skip(cr):
             continue
         to_page = Page.get_or_none(url=cr.indexable_url)
         if not to_page:
@@ -392,21 +409,12 @@ def crawl_page(
             gus.lib.logging.strip_control_chars(url),
         )
         return
-    for excluded_prefix in EXCLUDED_URL_PREFIXES:
-        if gr.normalized_url.startswith(excluded_prefix):
-            logging.info(
-                "URL prefix matches exclusion list, skipping: %s",
-                gus.lib.logging.strip_control_chars(url),
-            )
-            return
-    for excluded_path in EXCLUDED_URL_PATHS:
-        if gr.urlsplit.path.lower().endswith(excluded_path):
-            logging.info(
-                "URL on exclusion list, skipping: %s",
-                gus.lib.logging.strip_control_chars(url),
-            )
-            return
-
+    if should_skip(gr):
+        logging.info(
+            "URL is excluded, skipping: %s",
+            gus.lib.logging.strip_control_chars(url),
+        )
+        return
     if should_check_if_expired:
         existing_page = Page.get_or_none(url=gr.indexable_url)
         if existing_page and existing_page.change_frequency is not None:
diff --git a/tests/gus/lib/test_gemini.py b/tests/gus/lib/test_gemini.py
@@ -1,30 +1,28 @@
-import unittest
 from gus.lib.gemini import GeminiResource
 
-
-class TestGeminiResource(unittest.TestCase):
+class TestGeminiResource:
     def test_extract_contained_resources(self):
         url = "gemini://host"
         # no content
         resources = GeminiResource(url).extract_contained_resources("")
-        self.assertEqual(resources, [])
+        assert resources == []
 
         # not a link
         resources = GeminiResource(url).extract_contained_resources(" => link")
-        self.assertEqual(resources, [])
+        assert resources == []
 
         resources = GeminiResource(url).extract_contained_resources(
            "```\n=> preformatted\n```"
        )
-        self.assertEqual(resources, [])
+        assert resources == []
 
         # some links
         resources = GeminiResource(url).extract_contained_resources(
             "=> link\ntext\n=> other"
         )
-        self.assertEqual(len(resources), 2)
-        self.assertEqual(resources[0].raw_url, "link")
-        self.assertEqual(resources[1].raw_url, "other")
+        assert len(resources) == 2
+        assert resources[0].raw_url == "link"
+        assert resources[1].raw_url == "other"
 
         resources = GeminiResource(url).extract_contained_resources(
             """
@@ -36,8 +34,8 @@ text
 => no link
 ```
 => other
-            """
+            """
         )
-        self.assertEqual(len(resources), 2)
-        self.assertEqual(resources[0].raw_url, "link")
-        self.assertEqual(resources[1].raw_url, "other")
+        assert len(resources) == 2
+        assert resources[0].raw_url == "link"
+        assert resources[1].raw_url == "other"
diff --git a/tests/gus/test_crawl.py b/tests/gus/test_crawl.py
@@ -0,0 +1,39 @@
+import pytest
+
+from gus.crawl import should_skip
+from gus.lib.gemini import GeminiResource
+
+class TestUrlExclusion:
+    @pytest.mark.parametrize("test_url,expected_result", [
+        ("gemini://gemini.circumlunar.space/favicon.ico", True),
+        ("gemini://gemini.circumlunar.space/rss.txt", True),
+    ])
+    def test_excluded_url_paths(self, test_url, expected_result):
+        resource = GeminiResource(test_url)
+        assert should_skip(resource) == expected_result
+
+
+    @pytest.mark.parametrize("test_url,expected_result", [
+        ("gemini://hannuhartikainen.fi/twinwiki/_revert/1594367314474", True),
+        ("gemini://hannuhartikainen.fi/twinwiki/1594367314474", False),
+        ("gemini://hannuhartikainen.fi/twinwiki/Sandbox/_history/1594037613712", True),
+        ("gemini://hannuhartikainen.fi/twinwiki", False),
+        ("gemini://123456.ch", True),
+        ("gemini://123456.ch/fnord", True),
+        ("gemini://almp1234.app", True),
+        ("gemini://almp1234.app/fnord", True),
+    ])
+    def test_excluded_url_pattern(self, test_url, expected_result):
+        resource = GeminiResource(test_url)
+        assert should_skip(resource) == expected_result
+
+
+    @pytest.mark.parametrize("test_url,expected_result", [
+        ("gemini://localhost", True),
+        ("gemini://example.org", True),
+        ("gus.guru", False),
+        ("gus.guru/search?turkey", True),
+    ])
+    def test_excluded_url_prefixes(self, test_url, expected_result):
+        resource = GeminiResource(test_url)
+        assert should_skip(resource) == expected_result