commit 153745eb823c0962489289d85171bbdc24310855
parent 7b5205c27c989e2f52eec57712de4a93de326db2
Author: Natalie Pendragon <natpen@natpen.net>
Date: Tue, 17 Nov 2020 08:32:02 -0500
Add regex-based url exclusion support & refactor tests
This commit adds support for excluding URLs by regex, which is more
powerful than the prefix- and suffix-based exclusions GUS has so far
supported. There have been a number of cases, primarily involving
wiki-type sites, where it would be useful to match a URL by a pattern
that occurs in the middle of the URL, which is now possible. An
example of this is twinwiki's "_history" and "_revert" pages.
This commit also refactors the existing test file from the previous
unittest style to a more native pytest style. Additionally, it adds a
new set of tests for the URL exclusion functionality, covering both
the new regex-based exclusion described above and the older
prefix/suffix-based exclusion.
Diffstat:
5 files changed, 83 insertions(+), 42 deletions(-)
diff --git a/README.md b/README.md
@@ -34,7 +34,7 @@ Now you'll have created `index.new` directory, rename it to `index`.
## Running test suite
-Run: "poetry run python -m pytest"
+Run: "poetry run pytest"
## Roadmap / TODOs
diff --git a/gus/build_index.py b/gus/build_index.py
@@ -7,6 +7,7 @@ from urllib.parse import uses_relative, uses_netloc
from . import constants
from gus.crawl import EXCLUDED_URL_PREFIXES
+from gus.crawl import should_skip
from gus.lib.db_model import init_db, Page
+from gus.lib.gemini import GeminiResource
from gus.lib.index_statistics import (
compute_index_statistics,
persist_statistics,
@@ -23,14 +24,9 @@ uses_netloc.append("gemini")
def index_page(index, page, indexed_urls):
- should_skip = False
- for excluded_prefix in EXCLUDED_URL_PREFIXES:
- if page.normalized_url.startswith(excluded_prefix):
- should_skip = True
- break
- if should_skip:
+ if should_skip(GeminiResource(page.url)):
logging.debug(
- "URL prefix matches exclusion list, skipping: %s",
+ "URL is excluded, skipping: %s",
strip_control_chars(page.url),
)
return False
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -1,5 +1,6 @@
import argparse
import logging
+import re
from datetime import datetime, timedelta
import os
@@ -21,6 +22,11 @@ import gus.lib.logging
uses_relative.append("gemini")
uses_netloc.append("gemini")
+# Regex-based exclusions, matched (case-insensitively) from the start of a
+# resource's normalized_url. A URL is excluded when its host part begins
+# with six digits followed by ".ch", or with "almp" plus four digits
+# followed by ".app", or when the URL contains a "/_revert/" or
+# "/_history/" path segment anywhere after the scheme.
+# NOTE(review): the host alternatives are not anchored at the end of the
+# hostname, so e.g. "gemini://123456.chfoo.example" would also match --
+# confirm whether that is intended.
+EXCLUDED_URL_PATTERN = re.compile(
+    r"^gemini://(\d{6}\.ch|almp\d{4}\.app|.*/_(revert|history)/).*",
+    flags=re.IGNORECASE
+)
+
# These are checked against normalized_url, so they should be
# prepended with the gemini:// protocol, be all lowercased, and
# not have the port specified if it is 1965.
@@ -331,17 +337,28 @@ def index_content(resource, response):
return page, is_different
+def should_skip(resource):
+ should_skip = False
+ for excluded_prefix in EXCLUDED_URL_PREFIXES:
+ if resource.normalized_url.startswith(excluded_prefix):
+ should_skip = True
+ break
+ for excluded_path in EXCLUDED_URL_PATHS:
+ if resource.urlsplit.path.lower().endswith(excluded_path):
+ should_skip = True
+ break
+ m = EXCLUDED_URL_PATTERN.match(resource.normalized_url)
+ if m:
+ should_skip = True
+ return should_skip
+
+
def index_links(from_resource, contained_resources):
from_page, created = Page.get_or_create(url=from_resource.indexable_url)
Link.delete().where(Link.from_page == from_page).execute()
data = []
for cr in contained_resources:
- should_skip = False
- for excluded_prefix in EXCLUDED_URL_PREFIXES:
- if cr.normalized_url.startswith(excluded_prefix):
- should_skip = True
- break
- if should_skip:
+ if should_skip(cr):
continue
to_page = Page.get_or_none(url=cr.indexable_url)
if not to_page:
@@ -392,21 +409,12 @@ def crawl_page(
gus.lib.logging.strip_control_chars(url),
)
return
- for excluded_prefix in EXCLUDED_URL_PREFIXES:
- if gr.normalized_url.startswith(excluded_prefix):
- logging.info(
- "URL prefix matches exclusion list, skipping: %s",
- gus.lib.logging.strip_control_chars(url),
- )
- return
- for excluded_path in EXCLUDED_URL_PATHS:
- if gr.urlsplit.path.lower().endswith(excluded_path):
- logging.info(
- "URL on exclusion list, skipping: %s",
- gus.lib.logging.strip_control_chars(url),
- )
- return
-
+ if should_skip(gr):
+ logging.info(
+ "URL is excluded, skipping: %s",
+ gus.lib.logging.strip_control_chars(url),
+ )
+ return
if should_check_if_expired:
existing_page = Page.get_or_none(url=gr.indexable_url)
if existing_page and existing_page.change_frequency is not None:
diff --git a/tests/gus/lib/test_gemini.py b/tests/gus/lib/test_gemini.py
@@ -1,30 +1,28 @@
-import unittest
from gus.lib.gemini import GeminiResource
-
-class TestGeminiResource(unittest.TestCase):
+class TestGeminiResource:
def test_extract_contained_resources(self):
url = "gemini://host"
# no content
resources = GeminiResource(url).extract_contained_resources("")
- self.assertEqual(resources, [])
+ assert resources == []
# not a link
resources = GeminiResource(url).extract_contained_resources(" => link")
- self.assertEqual(resources, [])
+ assert resources == []
resources = GeminiResource(url).extract_contained_resources(
"```\n=> preformatted\n```"
)
- self.assertEqual(resources, [])
+ assert resources == []
# some links
resources = GeminiResource(url).extract_contained_resources(
"=> link\ntext\n=> other"
)
- self.assertEqual(len(resources), 2)
- self.assertEqual(resources[0].raw_url, "link")
- self.assertEqual(resources[1].raw_url, "other")
+ assert len(resources) == 2
+ assert resources[0].raw_url == "link"
+ assert resources[1].raw_url == "other"
resources = GeminiResource(url).extract_contained_resources(
"""
@@ -36,8 +34,8 @@ text
=> no link
```
=> other
- """
+ """
)
- self.assertEqual(len(resources), 2)
- self.assertEqual(resources[0].raw_url, "link")
- self.assertEqual(resources[1].raw_url, "other")
+ assert len(resources) == 2
+ assert resources[0].raw_url == "link"
+ assert resources[1].raw_url == "other"
diff --git a/tests/gus/test_crawl.py b/tests/gus/test_crawl.py
@@ -0,0 +1,39 @@
+import pytest
+
+from gus.crawl import should_skip
+from gus.lib.gemini import GeminiResource
+
+class TestUrlExclusion:
+ @pytest.mark.parametrize("test_url,expected_result", [
+ ("gemini://gemini.circumlunar.space/favicon.ico", True),
+ ("gemini://gemini.circumlunar.space/rss.txt", True),
+ ])
+ def test_excluded_url_paths(self, test_url, expected_result):
+ resource = GeminiResource(test_url)
+ assert should_skip(resource) == expected_result
+
+
+ @pytest.mark.parametrize("test_url,expected_result", [
+ ("gemini://hannuhartikainen.fi/twinwiki/_revert/1594367314474", True),
+ ("gemini://hannuhartikainen.fi/twinwiki/1594367314474", False),
+ ("gemini://hannuhartikainen.fi/twinwiki/Sandbox/_history/1594037613712", True),
+ ("gemini://hannuhartikainen.fi/twinwiki", False),
+ ("gemini://123456.ch", True),
+ ("gemini://123456.ch/fnord", True),
+ ("gemini://almp1234.app", True),
+ ("gemini://almp1234.app/fnord", True),
+ ])
+ def test_excluded_url_pattern(self, test_url, expected_result):
+ resource = GeminiResource(test_url)
+ assert should_skip(resource) == expected_result
+
+
+ @pytest.mark.parametrize("test_url,expected_result", [
+ ("gemini://localhost", True),
+ ("gemini://example.org", True),
+ ("gus.guru", False),
+ ("gus.guru/search?turkey", True),
+ ])
+ def test_excluded_url_prefixes(self, test_url, expected_result):
+ resource = GeminiResource(test_url)
+ assert should_skip(resource) == expected_result