Improve it all - geminispace.info - gemini search engine

commit 220709fdd46669fb36fdc321dc82f95205fe0b2d
parent e537dcdb4f9686d26396870a0b263d72c4a04519
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Sat, 14 Mar 2020 22:50:06 -0400

Improve it all

Diffstat:
M README.md  | 22 ----------------------
M gus/crawl.py  | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
M gus/serve.py  | 13 +++++++------
M poetry.lock  | 58 +++++++++++++++++++++++++++++-----------------------------

4 files changed, 106 insertions(+), 81 deletions(-)
diff --git a/README.md b/README.md
@@ -30,19 +30,6 @@ as this guide to [mailing list etiquette](https://man.sr.ht/lists.sr.ht/etiquett
 - **general code cleanup**: most notably crawl.py. There are a lot
   of hacks in there that I put in for expediency, but haven't
   taken the time to address.
-- **improve the indexing**: currently, the url is prepended to
-  the page content, and everything is simply indexed with the
-  default indexer. I think a better solution would be to have
-  urls indexed with a url-specific indexer that doesn't do
-  things like, e.g., porter-stemming, which I assume the
-  default indexer is doing.
-- **extend the index to handle binary links in Geminispace**:
-  currently, there's a hack in the code to simply skip
-  anything that looks like a binary link. I think with the
-  above improvement to how indexing works, they could be
-  made very effectively searchable. Also in this vein,
-  binary links should be identified via their mime types
-  probably, instead of the suffix hack used now.
 - **add tests**: there aren't any yet!
 - **add functionality to create a mock index**: this would
   be useful for local hacking on serve.py, so one does
@@ -55,13 +42,4 @@ as this guide to [mailing list etiquette](https://man.sr.ht/lists.sr.ht/etiquett
   solution will become increasingly unappealing as the amount
   of content, and thus amount of search hits, in Geminispace
   grows).
-- **extend the index to handle query links in Geminispace**:
-  currently, there's a hack in the code to simply skip
-  query links based on the response status code. These should
-  be indexed, probably with their urls and their query prompt
-  texts as the indexed content.
-- **allow seedlist additions from the site itself**: this
-  would allow anyone to ensure their content gets crawled,
-  even if there is currently no path from the current
-  seedlist to their content.
 - **track freshness of content**
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -2,7 +2,7 @@ import pathlib
 import re
 import shutil
 from urllib import robotparser
-from urllib.parse import urlparse, urlunparse
+from urllib.parse import unquote, urljoin, urlparse, urlunparse, uses_relative, uses_netloc
 
 import gusmobile as gemini
 from whoosh.analysis import FancyAnalyzer
@@ -11,6 +11,11 @@ from whoosh.index import create_in
 
 from gus.whoosh_extensions import UrlAnalyzer
 
+# hack(natpen): the built-in methods in urllib need to know the
+# Gemini protocol exists
+uses_relative.append("gemini")
+uses_netloc.append("gemini")
+
 INDEX_DIR = "index"
 
 SEED_URLS = [
@@ -29,6 +34,7 @@ SEED_URLS = [
     "gemini://dump.royniang.com",
     "gemini://konpeito.media",
     "gemini://gemini.68kmentat.com",
+    # "gemini://envs.net",
 ]
 
 
@@ -48,45 +54,74 @@ def create_index(index_dir):
             analyzer=FancyAnalyzer(),
             spelling=True,
         ),
+        prompt=TEXT(
+            analyzer=FancyAnalyzer(),
+            stored=True,
+        ),
     )
     index = create_in("index", schema)
     index_writer = index.writer()
     return index_writer
 
 
-def clean_links(links):
+def clean_links(links, current_url):
     clean_links = []
     for link in links:
         clean_link = link
         u = urlparse(link)
-        if u.scheme is not None and u.scheme != "gemini":
+        if u.scheme != '' and u.scheme != "gemini":
             continue
-        if u.port is None:
-            clean_link = clean_link.replace(u.hostname, u.hostname+":1965")
+        if u.netloc == '':
+            # relative link
+            clean_link = urljoin(current_url, clean_link)
+            u = urlparse(clean_link)
+        if u.port == 1965:
+            clean_link = clean_link.replace(u.hostname+":1965", u.hostname, 1)
         if u.scheme is None:
-            clean_link = clean_link.replace(u.hostname, "gemini://"+u.hostname)
+            clean_link = clean_link.replace(u.hostname, "gemini://"+u.hostname, 1)
         clean_links.append(clean_link)
     return clean_links
 
 
 def normalize_gemini_url(url):
+    if "%" in url:
+        url = unquote(url)
     u = urlparse(url.lower().strip().rstrip('/'), 'gemini')
+    if u.hostname is None:
+        return None
     url_normalized = urlunparse(u)
-    if u.port is None:
-        url_normalized = url_normalized.replace(u.hostname, u.hostname+":1965")
+    if u.port == 1965:
+        url_normalized = url_normalized.replace(u.hostname+":1965", u.hostname, 1)
     if u.scheme is None:
-        url_normalized = url_normalized.replace(u.hostname, "gemini://"+u.hostname)
+        url_normalized = url_normalized.replace(u.hostname, "gemini://"+u.hostname, 1)
     return url_normalized
 
-def extract_gemini_links(content):
+def extract_gemini_links(content, current_url):
     link_pattern = "=>\s(\S+)"
     links = re.findall(link_pattern, content)
-    gemini_links = clean_links(links)
+    gemini_links = clean_links(links, current_url)
     return gemini_links
 
 
+def index_binary(response):
+    print("INDEXING BINARY...")
+    index_writer.add_document(
+        url=response.url,
+        content_type=response.content_type,
+    )
+
+
+def index_prompt(response):
+    print("INDEXING PROMPT...")
+    index_writer.add_document(
+        url=response.url,
+        content_type="input",
+        prompt=response.prompt,
+    )
+
+
 def index_content(response):
-    print("INDEXING...")
+    print("INDEXING CONTENT...")
     index_writer.add_document(
         url=response.url,
         content_type=response.content_type,
@@ -113,12 +148,13 @@ def crawl_url(url):
     u = urlparse(url, 'gemini')
     url = urlunparse(u)
     path = u.path.lower().rstrip().rstrip('/')
-    if path.endswith(".mp3") or path.endswith(".png") or path.endswith(".jpg") or path.endswith(".jpeg") or path.endswith(".zip"):
-        print("BINARY SKIP  : %s" % url)
-        print("--------------------------")
-        return
     normalized_url = normalize_gemini_url(url)
-    if normalized_url.startswith("gemini://example.org"):
+    if normalized_url is None or \
+       normalized_url.startswith("gemini://example.org") or \
+       normalized_url.startswith("gemini://example.com") or \
+       normalized_url.startswith("gemini://gemini.conman.org/test"):
+        print("MANUAL EXCLUSION SKIP  : %s" % url)
+        print("--------------------------")
         return
     robots_file = get_robots_file(normalized_url)
     if robots_file is not None:
@@ -127,7 +163,6 @@ def crawl_url(url):
             print("ROBOTS SKIP  : %s" % url)
             print("--------------------------")
             return
-
     if normalized_url in visited_urls:
         print("ALREADY SEEN : %s" % url)
         print("--------------------------")
@@ -142,18 +177,29 @@ def crawl_url(url):
     elif r.status.startswith("3"):
         # redirect status
         print("REDIRECT     : %s -> %s" % (url, r.url))
+        visited_urls.pop()
         crawl_url(r.url)
+    elif r.status.startswith("1"):
+        # input status
+        print("URL          : %s" % r.url)
+        print("STATUS       : %s" % r.status)
+        print("PROMPT       : %s" % r.prompt)
+        index_prompt(r)
+        print("--------------------------")
     elif r.status.startswith("2"):
         # success status
         print("URL          : %s" % r.url)
         print("STATUS       : %s" % r.status)
-        print("STATUS META  : %s" % r.status_meta)
         print("CONTENT TYPE : %s" % r.content_type)
-        index_content(r)
-        print("--------------------------")
-        gemini_links = extract_gemini_links(r.content)
-        for link in gemini_links:
-            crawl_url(link)
+        if r.content_type.startswith("text/"):
+            index_content(r)
+            print("--------------------------")
+            gemini_links = extract_gemini_links(r.content, r.url)
+            for link in gemini_links:
+                crawl_url(link)
+        else:
+            index_binary(r)
+            print("--------------------------")
     else:
         # input, error, etc (all other statuses)
         print("UNHANDLED    : %s" % url)
diff --git a/gus/serve.py b/gus/serve.py
@@ -27,7 +27,7 @@ def _render_header():
 def _render_footer():
     return [
         "",
-        "=> /add-seed See any missing results? Add a gemini URL to the index here."
+        "=> /add-seed See any missing results? Let GUS know your gemini URL exists."
     ]
 
 
@@ -66,7 +66,7 @@ def index(request):
 
 
 def _search_index(query):
-    query = MultifieldParser(["content", "url"], ix.schema).parse(query)
+    query = MultifieldParser(["content", "url", "prompt"], ix.schema).parse(query)
     results = searcher.search(query)
     return (
         len(results),
@@ -98,17 +98,18 @@ def _render_results(results):
     for i, result in enumerate(results):
         if i > 0:
             data.append("")
+        prompt_suffix = ""
+        if result[2] == "input":
+            prompt_suffix = ": {}".format(result[4])
         data.append("=> {}".format(result[1]))
-        data.append("{} (score: {:.2f})".format(result[2], result[0]))
+        data.append("{}{} (score: {:.2f})".format(result[2], prompt_suffix, result[0]))
     return data
 
 
 def _render_results_header(query, num_results):
     return [
         "",
-        "| \"{}\" - {} hits".format(query, num_results),
-        # "| {} hits".format(num_results),
-        # "===========================",
+        "> \"{}\" - {} hits".format(query, num_results),
         ""
     ]
 
diff --git a/poetry.lock b/poetry.lock
@@ -71,8 +71,8 @@ category = "dev"
 description = "Composable command line interface toolkit"
 name = "click"
 optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
-version = "7.0"
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+version = "7.1.1"
 
 [[package]]
 category = "dev"
@@ -89,7 +89,7 @@ description = "Decorators for Humans"
 name = "decorator"
 optional = false
 python-versions = ">=2.6, !=3.0.*, !=3.1.*"
-version = "4.4.1"
+version = "4.4.2"
 
 [[package]]
 category = "main"
@@ -100,7 +100,7 @@ python-versions = "*"
 version = "0.1.0"
 
 [package.source]
-reference = "72e639c07d06c48c5b545d238e35e406c1aece89"
+reference = "123c73b4e06c89781543dfcba55581d6a3931129"
 type = "git"
 url = "https://git.sr.ht/~natpen/gusmobile"
 [[package]]
@@ -125,7 +125,7 @@ description = "IPython: Productive Interactive Computing"
 name = "ipython"
 optional = false
 python-versions = ">=3.6"
-version = "7.12.0"
+version = "7.13.0"
 
 [package.dependencies]
 appnope = "*"
@@ -141,7 +141,7 @@ setuptools = ">=18.5"
 traitlets = ">=4.2"
 
 [package.extras]
-all = ["ipyparallel", "requests", "notebook", "qtconsole", "ipywidgets", "pygments", "nbconvert", "testpath", "Sphinx (>=1.3)", "nbformat", "numpy (>=1.14)", "ipykernel", "nose (>=0.10.1)"]
+all = ["numpy (>=1.14)", "testpath", "notebook", "nose (>=0.10.1)", "nbconvert", "requests", "ipywidgets", "qtconsole", "ipyparallel", "Sphinx (>=1.3)", "pygments", "nbformat", "ipykernel"]
 doc = ["Sphinx (>=1.3)"]
 kernel = ["ipykernel"]
 nbconvert = ["nbconvert"]
@@ -196,7 +196,7 @@ description = "Core utilities for Python packages"
 name = "packaging"
 optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
-version = "20.1"
+version = "20.3"
 
 [package.dependencies]
 pyparsing = ">=2.0.2"
@@ -262,8 +262,8 @@ category = "dev"
 description = "Library for building powerful interactive command lines in Python"
 name = "prompt-toolkit"
 optional = false
-python-versions = ">=3.6"
-version = "3.0.3"
+python-versions = ">=3.6.1"
+version = "3.0.4"
 
 [package.dependencies]
 wcwidth = "*"
@@ -290,8 +290,8 @@ category = "dev"
 description = "Pygments is a syntax highlighting package written in Python."
 name = "pygments"
 optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
-version = "2.5.2"
+python-versions = ">=3.5"
+version = "2.6.1"
 
 [[package]]
 category = "dev"
@@ -307,7 +307,7 @@ description = "pytest: simple powerful testing with Python"
 name = "pytest"
 optional = false
 python-versions = ">=3.5"
-version = "5.3.5"
+version = "5.4.1"
 
 [package.dependencies]
 atomicwrites = ">=1.0"
@@ -398,7 +398,7 @@ marker = "python_version < \"3.8\""
 name = "zipp"
 optional = false
 python-versions = ">=3.6"
-version = "3.0.0"
+version = "3.1.0"
 
 [package.extras]
 docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"]
@@ -434,16 +434,16 @@ black = [
     {file = "black-19.10b0.tar.gz", hash = "sha256:c2edb73a08e9e0e6f65a0e6af18b059b8b1cdd5bef997d7a0b181df93dc81539"},
 ]
 click = [
-    {file = "Click-7.0-py2.py3-none-any.whl", hash = "sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13"},
-    {file = "Click-7.0.tar.gz", hash = "sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7"},
+    {file = "click-7.1.1-py2.py3-none-any.whl", hash = "sha256:e345d143d80bf5ee7534056164e5e112ea5e22716bbb1ce727941f4c8b471b9a"},
+    {file = "click-7.1.1.tar.gz", hash = "sha256:8a18b4ea89d8820c5d0c7da8a64b2c324b4dabb695804dbfea19b9be9d88c0cc"},
 ]
 colorama = [
     {file = "colorama-0.4.3-py2.py3-none-any.whl", hash = "sha256:7d73d2a99753107a36ac6b455ee49046802e59d9d076ef8e47b61499fa29afff"},
     {file = "colorama-0.4.3.tar.gz", hash = "sha256:e96da0d330793e2cb9485e9ddfd918d456036c7149416295932478192f4436a1"},
 ]
 decorator = [
-    {file = "decorator-4.4.1-py2.py3-none-any.whl", hash = "sha256:5d19b92a3c8f7f101c8dd86afd86b0f061a8ce4540ab8cd401fa2542756bce6d"},
-    {file = "decorator-4.4.1.tar.gz", hash = "sha256:54c38050039232e1db4ad7375cfce6748d7b41c29e95a081c8a6d2c30364a2ce"},
+    {file = "decorator-4.4.2-py2.py3-none-any.whl", hash = "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760"},
+    {file = "decorator-4.4.2.tar.gz", hash = "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"},
 ]
 gusmobile = []
 importlib-metadata = [
@@ -451,8 +451,8 @@ importlib-metadata = [
     {file = "importlib_metadata-1.5.0.tar.gz", hash = "sha256:06f5b3a99029c7134207dd882428a66992a9de2bef7c2b699b5641f9886c3302"},
 ]
 ipython = [
-    {file = "ipython-7.12.0-py3-none-any.whl", hash = "sha256:f6689108b1734501d3b59c84427259fd5ac5141afe2e846cfa8598eb811886c9"},
-    {file = "ipython-7.12.0.tar.gz", hash = "sha256:d9459e7237e2e5858738ff9c3e26504b79899b58a6d49e574d352493d80684c6"},
+    {file = "ipython-7.13.0-py3-none-any.whl", hash = "sha256:eb8d075de37f678424527b5ef6ea23f7b80240ca031c2dd6de5879d687a65333"},
+    {file = "ipython-7.13.0.tar.gz", hash = "sha256:ca478e52ae1f88da0102360e57e528b92f3ae4316aabac80a2cd7f7ab2efb48a"},
 ]
 ipython-genutils = [
     {file = "ipython_genutils-0.2.0-py2.py3-none-any.whl", hash = "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8"},
@@ -471,8 +471,8 @@ more-itertools = [
     {file = "more_itertools-8.2.0-py3-none-any.whl", hash = "sha256:5dd8bcf33e5f9513ffa06d5ad33d78f31e1931ac9a18f33d37e77a180d393a7c"},
 ]
 packaging = [
-    {file = "packaging-20.1-py2.py3-none-any.whl", hash = "sha256:170748228214b70b672c581a3dd610ee51f733018650740e98c7df862a583f73"},
-    {file = "packaging-20.1.tar.gz", hash = "sha256:e665345f9eef0c621aa0bf2f8d78cf6d21904eef16a93f020240b704a57f1334"},
+    {file = "packaging-20.3-py2.py3-none-any.whl", hash = "sha256:82f77b9bee21c1bafbf35a84905d604d5d1223801d639cf3ed140bd651c08752"},
+    {file = "packaging-20.3.tar.gz", hash = "sha256:3c292b474fda1671ec57d46d739d072bfd495a4f51ad01a055121d81e952b7a3"},
 ]
 parso = [
     {file = "parso-0.6.2-py2.py3-none-any.whl", hash = "sha256:8515fc12cfca6ee3aa59138741fc5624d62340c97e401c74875769948d4f2995"},
@@ -495,8 +495,8 @@ pluggy = [
     {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"},
 ]
 prompt-toolkit = [
-    {file = "prompt_toolkit-3.0.3-py3-none-any.whl", hash = "sha256:c93e53af97f630f12f5f62a3274e79527936ed466f038953dfa379d4941f651a"},
-    {file = "prompt_toolkit-3.0.3.tar.gz", hash = "sha256:a402e9bf468b63314e37460b68ba68243d55b2f8c4d0192f85a019af3945050e"},
+    {file = "prompt_toolkit-3.0.4-py3-none-any.whl", hash = "sha256:859e1b205b6cf6a51fa57fa34202e45365cf58f8338f0ee9f4e84a4165b37d5b"},
+    {file = "prompt_toolkit-3.0.4.tar.gz", hash = "sha256:ebe6b1b08c888b84c50d7f93dee21a09af39860144ff6130aadbd61ae8d29783"},
 ]
 ptyprocess = [
     {file = "ptyprocess-0.6.0-py2.py3-none-any.whl", hash = "sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f"},
@@ -507,16 +507,16 @@ py = [
     {file = "py-1.8.1.tar.gz", hash = "sha256:5e27081401262157467ad6e7f851b7aa402c5852dbcb3dae06768434de5752aa"},
 ]
 pygments = [
-    {file = "Pygments-2.5.2-py2.py3-none-any.whl", hash = "sha256:2a3fe295e54a20164a9df49c75fa58526d3be48e14aceba6d6b1e8ac0bfd6f1b"},
-    {file = "Pygments-2.5.2.tar.gz", hash = "sha256:98c8aa5a9f778fcd1026a17361ddaf7330d1b7c62ae97c3bb0ae73e0b9b6b0fe"},
+    {file = "Pygments-2.6.1-py3-none-any.whl", hash = "sha256:ff7a40b4860b727ab48fad6360eb351cc1b33cbf9b15a0f689ca5353e9463324"},
+    {file = "Pygments-2.6.1.tar.gz", hash = "sha256:647344a061c249a3b74e230c739f434d7ea4d8b1d5f3721bc0f3558049b38f44"},
 ]
 pyparsing = [
     {file = "pyparsing-2.4.6-py2.py3-none-any.whl", hash = "sha256:c342dccb5250c08d45fd6f8b4a559613ca603b57498511740e65cd11a2e7dcec"},
     {file = "pyparsing-2.4.6.tar.gz", hash = "sha256:4c830582a84fb022400b85429791bc551f1f4871c33f23e44f353119e92f969f"},
 ]
 pytest = [
-    {file = "pytest-5.3.5-py3-none-any.whl", hash = "sha256:ff615c761e25eb25df19edddc0b970302d2a9091fbce0e7213298d85fb61fef6"},
-    {file = "pytest-5.3.5.tar.gz", hash = "sha256:0d5fe9189a148acc3c3eb2ac8e1ac0742cb7618c084f3d228baaec0c254b318d"},
+    {file = "pytest-5.4.1-py3-none-any.whl", hash = "sha256:0e5b30f5cb04e887b91b1ee519fa3d89049595f428c1db76e73bd7f17b09b172"},
+    {file = "pytest-5.4.1.tar.gz", hash = "sha256:84dde37075b8805f3d1f392cc47e38a0e59518fb46a431cfdaf7cf1ce805f970"},
 ]
 regex = [
     {file = "regex-2020.2.20-cp27-cp27m-win32.whl", hash = "sha256:99272d6b6a68c7ae4391908fc15f6b8c9a6c345a46b632d7fdb7ef6c883a2bbb"},
@@ -587,6 +587,6 @@ whoosh = [
     {file = "Whoosh-2.7.4.zip", hash = "sha256:e0857375f63e9041e03fedd5b7541f97cf78917ac1b6b06c1fcc9b45375dda69"},
 ]
 zipp = [
-    {file = "zipp-3.0.0-py3-none-any.whl", hash = "sha256:12248a63bbdf7548f89cb4c7cda4681e537031eda29c02ea29674bc6854460c2"},
-    {file = "zipp-3.0.0.tar.gz", hash = "sha256:7c0f8e91abc0dc07a5068f315c52cb30c66bfbc581e5b50704c8a2f6ebae794a"},
+    {file = "zipp-3.1.0-py3-none-any.whl", hash = "sha256:aa36550ff0c0b7ef7fa639055d797116ee891440eac1a56f378e2d3179e0320b"},
+    {file = "zipp-3.1.0.tar.gz", hash = "sha256:c599e4d75c98f6798c509911d08a22e6c021d074469042177c8c86fb92eefd96"},
 ]

	geminispace.info gemini search engine
	git clone https://git.clttr.info/geminispace.info.git
	Log (Feed) | Files | Refs (Tags) | README | LICENSE

M	README.md	|	22	----------------------
M	gus/crawl.py	|	94	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
M	gus/serve.py	|	13	+++++++------
M	poetry.lock	|	58	+++++++++++++++++++++++++++++-----------------------------