geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit b012dd5cc733f553a34c5eb5d0e6b2bf92e489c7
parent 75378967f722fbf1138d029612f68c684e8536d2
Author: Remco van 't Veer <remco@remworks.net>
Date:   Sun,  1 Nov 2020 15:39:26 +0100

Ignore link-like lines in preformatted text blocks

Blocks of text between ``` lines should not be interpreted as markup.

Signed-off-by: Natalie Pendragon <natpen@natpen.net>

Diffstat:
M README.md | 6 +++++-
M gus/lib/gemini.py | 4 +++-
A tests/gus/lib/test_gemini.py | 38 ++++++++++++++++++++++++++++++++++++++
3 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md @@ -32,11 +32,15 @@ Now you'll have created `index.new` directory, rename it to `index`. 3. Restart frontend +## Running test suite + +Run: "poetry run python -m pytest" + + ## Roadmap / TODOs - TODO: improve crawl and build_index automation - TODO: get crawl to run on a schedule with systemd -- TODO: add some automated tests - TODO: add functionality to create a mock index - TODO: exclude raw-text blocks from indexed content - TODO: strip control characters from logged output like URLs diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py @@ -349,7 +349,9 @@ class GeminiResource(): return self.contained_resources link_pattern = "^=>\s*(\S+)" - probable_urls = re.findall(link_pattern, content, re.MULTILINE) + preformat_pattern = r'^```.*?^```' + content_without_preformat = re.sub(preformat_pattern, '', content, flags=re.DOTALL | re.MULTILINE) + probable_urls = re.findall(link_pattern, content_without_preformat, re.MULTILINE) resources = [] for url in probable_urls: resource = GeminiResource( diff --git a/tests/gus/lib/test_gemini.py b/tests/gus/lib/test_gemini.py @@ -0,0 +1,38 @@ +import unittest +from gus.lib.gemini import GeminiResource + + +class TestGeminiResource(unittest.TestCase): + + def test_extract_contained_resources(self): + url = 'gemini://host' + + # no content + resources = GeminiResource(url).extract_contained_resources('') + self.assertEqual(resources, []) + + # not a link + resources = GeminiResource(url).extract_contained_resources(' => link') + self.assertEqual(resources, []) + resources = GeminiResource(url).extract_contained_resources('```\n=> preformatted\n```') + self.assertEqual(resources, []) + + # some links + resources = GeminiResource(url).extract_contained_resources('=> link\ntext\n=> other') + self.assertEqual(len(resources), 2) + self.assertEqual(resources[0].raw_url, 'link') + self.assertEqual(resources[1].raw_url, 'other') + + resources = GeminiResource(url).extract_contained_resources(""" +# title 
+text +=> link +text +``` preformatted +=> no link +``` +=> other + """) + self.assertEqual(len(resources), 2) + self.assertEqual(resources[0].raw_url, 'link') + self.assertEqual(resources[1].raw_url, 'other')