commit b012dd5cc733f553a34c5eb5d0e6b2bf92e489c7
parent 75378967f722fbf1138d029612f68c684e8536d2
Author: Remco van 't Veer <remco@remworks.net>
Date: Sun, 1 Nov 2020 15:39:26 +0100
Ignore link-like lines in preformatted text blocks
Blocks of text between ``` lines should not be interpreted as markup.
Signed-off-by: Natalie Pendragon <natpen@natpen.net>
Diffstat:
3 files changed, 46 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
@@ -32,11 +32,15 @@ Now you'll have created `index.new` directory, rename it to `index`.
3. Restart frontend
+## Running test suite
+
+Run: "poetry run python -m pytest"
+
+
## Roadmap / TODOs
- TODO: improve crawl and build_index automation
- TODO: get crawl to run on a schedule with systemd
-- TODO: add some automated tests
- TODO: add functionality to create a mock index
- TODO: exclude raw-text blocks from indexed content
- TODO: strip control characters from logged output like URLs
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -349,7 +349,9 @@ class GeminiResource():
return self.contained_resources
link_pattern = "^=>\s*(\S+)"
- probable_urls = re.findall(link_pattern, content, re.MULTILINE)
+ preformat_pattern = r'^```.*?^```'
+ content_without_preformat = re.sub(preformat_pattern, '', content, flags=re.DOTALL | re.MULTILINE)
+ probable_urls = re.findall(link_pattern, content_without_preformat, re.MULTILINE)
resources = []
for url in probable_urls:
resource = GeminiResource(
diff --git a/tests/gus/lib/test_gemini.py b/tests/gus/lib/test_gemini.py
@@ -0,0 +1,38 @@
+import unittest
+from gus.lib.gemini import GeminiResource
+
+
+class TestGeminiResource(unittest.TestCase):
+
+ def test_extract_contained_resources(self):
+ url = 'gemini://host'
+
+ # no content
+ resources = GeminiResource(url).extract_contained_resources('')
+ self.assertEqual(resources, [])
+
+ # not a link
+ resources = GeminiResource(url).extract_contained_resources(' => link')
+ self.assertEqual(resources, [])
+ resources = GeminiResource(url).extract_contained_resources('```\n=> preformatted\n```')
+ self.assertEqual(resources, [])
+
+ # some links
+ resources = GeminiResource(url).extract_contained_resources('=> link\ntext\n=> other')
+ self.assertEqual(len(resources), 2)
+ self.assertEqual(resources[0].raw_url, 'link')
+ self.assertEqual(resources[1].raw_url, 'other')
+
+ resources = GeminiResource(url).extract_contained_resources("""
+# title
+text
+=> link
+text
+``` preformatted
+=> no link
+```
+=> other
+ """)
+ self.assertEqual(len(resources), 2)
+ self.assertEqual(resources[0].raw_url, 'link')
+ self.assertEqual(resources[1].raw_url, 'other')