geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit 220709fdd46669fb36fdc321dc82f95205fe0b2d
parent e537dcdb4f9686d26396870a0b263d72c4a04519
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Sat, 14 Mar 2020 22:50:06 -0400

Improve it all

Diffstat:
M README.md | 22 ----------------------
M gus/crawl.py | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
M gus/serve.py | 13 +++++++------
M poetry.lock | 58 +++++++++++++++++++++++++++++-----------------------------
4 files changed, 106 insertions(+), 81 deletions(-)

diff --git a/README.md b/README.md @@ -30,19 +30,6 @@ as this guide to [mailing list etiquette](https://man.sr.ht/lists.sr.ht/etiquett - **general code cleanup**: most notably crawl.py. There are a lot of hacks in there that I put in for expediency, but haven't taken the time to address. -- **improve the indexing**: currently, the url is prepended to - the page content, and everything is simply indexed with the - default indexer. I think a better solution would be to have - urls indexed with a url-specific indexer that doesn't do - things like, e.g., porter-stemming, which I assume the - default indexer is doing. -- **extend the index to handle binary links in Geminispace**: - currently, there's a hack in the code to simply skip - anything that looks like a binary link. I think with the - above improvement to how indexing works, they could be - made very effectively searchable. Also in this vein, - binary links should be identified via their mime types - probably, instead of the suffix hack used now. - **add tests**: there aren't any yet! - **add functionality to create a mock index**: this would be useful for local hacking on serve.py, so one does @@ -55,13 +42,4 @@ as this guide to [mailing list etiquette](https://man.sr.ht/lists.sr.ht/etiquett solution will become increasingly unappealing as the amount of content, and thus amount of search hits, in Geminispace grows). -- **extend the index to handle query links in Geminispace**: - currently, there's a hack in the code to simply skip - query links based on the response status code. These should - be indexed, probably with their urls and their query prompt - texts as the indexed content. -- **allow seedlist additions from the site itself**: this - would allow anyone to ensure their content gets crawled, - even if there is currently no path from the current - seedlist to their content. 
- **track freshness of content** diff --git a/gus/crawl.py b/gus/crawl.py @@ -2,7 +2,7 @@ import pathlib import re import shutil from urllib import robotparser -from urllib.parse import urlparse, urlunparse +from urllib.parse import unquote, urljoin, urlparse, urlunparse, uses_relative, uses_netloc import gusmobile as gemini from whoosh.analysis import FancyAnalyzer @@ -11,6 +11,11 @@ from whoosh.index import create_in from gus.whoosh_extensions import UrlAnalyzer +# hack(natpen): the built-in methods in urllib need to know the +# Gemini protocol exists +uses_relative.append("gemini") +uses_netloc.append("gemini") + INDEX_DIR = "index" SEED_URLS = [ @@ -29,6 +34,7 @@ SEED_URLS = [ "gemini://dump.royniang.com", "gemini://konpeito.media", "gemini://gemini.68kmentat.com", + # "gemini://envs.net", ] @@ -48,45 +54,74 @@ def create_index(index_dir): analyzer=FancyAnalyzer(), spelling=True, ), + prompt=TEXT( + analyzer=FancyAnalyzer(), + stored=True, + ), ) index = create_in("index", schema) index_writer = index.writer() return index_writer -def clean_links(links): +def clean_links(links, current_url): clean_links = [] for link in links: clean_link = link u = urlparse(link) - if u.scheme is not None and u.scheme != "gemini": + if u.scheme != '' and u.scheme != "gemini": continue - if u.port is None: - clean_link = clean_link.replace(u.hostname, u.hostname+":1965") + if u.netloc == '': + # relative link + clean_link = urljoin(current_url, clean_link) + u = urlparse(clean_link) + if u.port == 1965: + clean_link = clean_link.replace(u.hostname+":1965", u.hostname, 1) if u.scheme is None: - clean_link = clean_link.replace(u.hostname, "gemini://"+u.hostname) + clean_link = clean_link.replace(u.hostname, "gemini://"+u.hostname, 1) clean_links.append(clean_link) return clean_links def normalize_gemini_url(url): + if "%" in url: + url = unquote(url) u = urlparse(url.lower().strip().rstrip('/'), 'gemini') + if u.hostname is None: + return None url_normalized = urlunparse(u) - if 
u.port is None: - url_normalized = url_normalized.replace(u.hostname, u.hostname+":1965") + if u.port == 1965: + url_normalized = url_normalized.replace(u.hostname+":1965", u.hostname, 1) if u.scheme is None: - url_normalized = url_normalized.replace(u.hostname, "gemini://"+u.hostname) + url_normalized = url_normalized.replace(u.hostname, "gemini://"+u.hostname, 1) return url_normalized -def extract_gemini_links(content): +def extract_gemini_links(content, current_url): link_pattern = "=>\s(\S+)" links = re.findall(link_pattern, content) - gemini_links = clean_links(links) + gemini_links = clean_links(links, current_url) return gemini_links +def index_binary(response): + print("INDEXING BINARY...") + index_writer.add_document( + url=response.url, + content_type=response.content_type, + ) + + +def index_prompt(response): + print("INDEXING PROMPT...") + index_writer.add_document( + url=response.url, + content_type="input", + prompt=response.prompt, + ) + + def index_content(response): - print("INDEXING...") + print("INDEXING CONTENT...") index_writer.add_document( url=response.url, content_type=response.content_type, @@ -113,12 +148,13 @@ def crawl_url(url): u = urlparse(url, 'gemini') url = urlunparse(u) path = u.path.lower().rstrip().rstrip('/') - if path.endswith(".mp3") or path.endswith(".png") or path.endswith(".jpg") or path.endswith(".jpeg") or path.endswith(".zip"): - print("BINARY SKIP : %s" % url) - print("--------------------------") - return normalized_url = normalize_gemini_url(url) - if normalized_url.startswith("gemini://example.org"): + if normalized_url is None or \ + normalized_url.startswith("gemini://example.org") or \ + normalized_url.startswith("gemini://example.com") or \ + normalized_url.startswith("gemini://gemini.conman.org/test"): + print("MANUAL EXCLUSION SKIP : %s" % url) + print("--------------------------") return robots_file = get_robots_file(normalized_url) if robots_file is not None: @@ -127,7 +163,6 @@ def crawl_url(url): 
print("ROBOTS SKIP : %s" % url) print("--------------------------") return - if normalized_url in visited_urls: print("ALREADY SEEN : %s" % url) print("--------------------------") @@ -142,18 +177,29 @@ def crawl_url(url): elif r.status.startswith("3"): # redirect status print("REDIRECT : %s -> %s" % (url, r.url)) + visited_urls.pop() crawl_url(r.url) + elif r.status.startswith("1"): + # input status + print("URL : %s" % r.url) + print("STATUS : %s" % r.status) + print("PROMPT : %s" % r.prompt) + index_prompt(r) + print("--------------------------") elif r.status.startswith("2"): # success status print("URL : %s" % r.url) print("STATUS : %s" % r.status) - print("STATUS META : %s" % r.status_meta) print("CONTENT TYPE : %s" % r.content_type) - index_content(r) - print("--------------------------") - gemini_links = extract_gemini_links(r.content) - for link in gemini_links: - crawl_url(link) + if r.content_type.startswith("text/"): + index_content(r) + print("--------------------------") + gemini_links = extract_gemini_links(r.content, r.url) + for link in gemini_links: + crawl_url(link) + else: + index_binary(r) + print("--------------------------") else: # input, error, etc (all other statuses) print("UNHANDLED : %s" % url) diff --git a/gus/serve.py b/gus/serve.py @@ -27,7 +27,7 @@ def _render_header(): def _render_footer(): return [ "", - "=> /add-seed See any missing results? Add a gemini URL to the index here." + "=> /add-seed See any missing results? Let GUS know your gemini URL exists." 
] @@ -66,7 +66,7 @@ def index(request): def _search_index(query): - query = MultifieldParser(["content", "url"], ix.schema).parse(query) + query = MultifieldParser(["content", "url", "prompt"], ix.schema).parse(query) results = searcher.search(query) return ( len(results), @@ -98,17 +98,18 @@ def _render_results(results): for i, result in enumerate(results): if i > 0: data.append("") + prompt_suffix = "" + if result[2] == "input": + prompt_suffix = ": {}".format(result[4]) data.append("=> {}".format(result[1])) - data.append("{} (score: {:.2f})".format(result[2], result[0])) + data.append("{}{} (score: {:.2f})".format(result[2], prompt_suffix, result[0])) return data def _render_results_header(query, num_results): return [ "", - "| \"{}\" - {} hits".format(query, num_results), - # "| {} hits".format(num_results), - # "===========================", + "> \"{}\" - {} hits".format(query, num_results), "" ] diff --git a/poetry.lock b/poetry.lock @@ -71,8 +71,8 @@ category = "dev" description = "Composable command line interface toolkit" name = "click" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "7.0" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +version = "7.1.1" [[package]] category = "dev" @@ -89,7 +89,7 @@ description = "Decorators for Humans" name = "decorator" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*" -version = "4.4.1" +version = "4.4.2" [[package]] category = "main" @@ -100,7 +100,7 @@ python-versions = "*" version = "0.1.0" [package.source] -reference = "72e639c07d06c48c5b545d238e35e406c1aece89" +reference = "123c73b4e06c89781543dfcba55581d6a3931129" type = "git" url = "https://git.sr.ht/~natpen/gusmobile" [[package]] @@ -125,7 +125,7 @@ description = "IPython: Productive Interactive Computing" name = "ipython" optional = false python-versions = ">=3.6" -version = "7.12.0" +version = "7.13.0" [package.dependencies] appnope = "*" @@ -141,7 +141,7 @@ setuptools = ">=18.5" 
traitlets = ">=4.2" [package.extras] -all = ["ipyparallel", "requests", "notebook", "qtconsole", "ipywidgets", "pygments", "nbconvert", "testpath", "Sphinx (>=1.3)", "nbformat", "numpy (>=1.14)", "ipykernel", "nose (>=0.10.1)"] +all = ["numpy (>=1.14)", "testpath", "notebook", "nose (>=0.10.1)", "nbconvert", "requests", "ipywidgets", "qtconsole", "ipyparallel", "Sphinx (>=1.3)", "pygments", "nbformat", "ipykernel"] doc = ["Sphinx (>=1.3)"] kernel = ["ipykernel"] nbconvert = ["nbconvert"] @@ -196,7 +196,7 @@ description = "Core utilities for Python packages" name = "packaging" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "20.1" +version = "20.3" [package.dependencies] pyparsing = ">=2.0.2" @@ -262,8 +262,8 @@ category = "dev" description = "Library for building powerful interactive command lines in Python" name = "prompt-toolkit" optional = false -python-versions = ">=3.6" -version = "3.0.3" +python-versions = ">=3.6.1" +version = "3.0.4" [package.dependencies] wcwidth = "*" @@ -290,8 +290,8 @@ category = "dev" description = "Pygments is a syntax highlighting package written in Python." 
name = "pygments" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" -version = "2.5.2" +python-versions = ">=3.5" +version = "2.6.1" [[package]] category = "dev" @@ -307,7 +307,7 @@ description = "pytest: simple powerful testing with Python" name = "pytest" optional = false python-versions = ">=3.5" -version = "5.3.5" +version = "5.4.1" [package.dependencies] atomicwrites = ">=1.0" @@ -398,7 +398,7 @@ marker = "python_version < \"3.8\"" name = "zipp" optional = false python-versions = ">=3.6" -version = "3.0.0" +version = "3.1.0" [package.extras] docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"] @@ -434,16 +434,16 @@ black = [ {file = "black-19.10b0.tar.gz", hash = "sha256:c2edb73a08e9e0e6f65a0e6af18b059b8b1cdd5bef997d7a0b181df93dc81539"}, ] click = [ - {file = "Click-7.0-py2.py3-none-any.whl", hash = "sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13"}, - {file = "Click-7.0.tar.gz", hash = "sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7"}, + {file = "click-7.1.1-py2.py3-none-any.whl", hash = "sha256:e345d143d80bf5ee7534056164e5e112ea5e22716bbb1ce727941f4c8b471b9a"}, + {file = "click-7.1.1.tar.gz", hash = "sha256:8a18b4ea89d8820c5d0c7da8a64b2c324b4dabb695804dbfea19b9be9d88c0cc"}, ] colorama = [ {file = "colorama-0.4.3-py2.py3-none-any.whl", hash = "sha256:7d73d2a99753107a36ac6b455ee49046802e59d9d076ef8e47b61499fa29afff"}, {file = "colorama-0.4.3.tar.gz", hash = "sha256:e96da0d330793e2cb9485e9ddfd918d456036c7149416295932478192f4436a1"}, ] decorator = [ - {file = "decorator-4.4.1-py2.py3-none-any.whl", hash = "sha256:5d19b92a3c8f7f101c8dd86afd86b0f061a8ce4540ab8cd401fa2542756bce6d"}, - {file = "decorator-4.4.1.tar.gz", hash = "sha256:54c38050039232e1db4ad7375cfce6748d7b41c29e95a081c8a6d2c30364a2ce"}, + {file = "decorator-4.4.2-py2.py3-none-any.whl", hash = "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760"}, + {file = 
"decorator-4.4.2.tar.gz", hash = "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"}, ] gusmobile = [] importlib-metadata = [ @@ -451,8 +451,8 @@ importlib-metadata = [ {file = "importlib_metadata-1.5.0.tar.gz", hash = "sha256:06f5b3a99029c7134207dd882428a66992a9de2bef7c2b699b5641f9886c3302"}, ] ipython = [ - {file = "ipython-7.12.0-py3-none-any.whl", hash = "sha256:f6689108b1734501d3b59c84427259fd5ac5141afe2e846cfa8598eb811886c9"}, - {file = "ipython-7.12.0.tar.gz", hash = "sha256:d9459e7237e2e5858738ff9c3e26504b79899b58a6d49e574d352493d80684c6"}, + {file = "ipython-7.13.0-py3-none-any.whl", hash = "sha256:eb8d075de37f678424527b5ef6ea23f7b80240ca031c2dd6de5879d687a65333"}, + {file = "ipython-7.13.0.tar.gz", hash = "sha256:ca478e52ae1f88da0102360e57e528b92f3ae4316aabac80a2cd7f7ab2efb48a"}, ] ipython-genutils = [ {file = "ipython_genutils-0.2.0-py2.py3-none-any.whl", hash = "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8"}, @@ -471,8 +471,8 @@ more-itertools = [ {file = "more_itertools-8.2.0-py3-none-any.whl", hash = "sha256:5dd8bcf33e5f9513ffa06d5ad33d78f31e1931ac9a18f33d37e77a180d393a7c"}, ] packaging = [ - {file = "packaging-20.1-py2.py3-none-any.whl", hash = "sha256:170748228214b70b672c581a3dd610ee51f733018650740e98c7df862a583f73"}, - {file = "packaging-20.1.tar.gz", hash = "sha256:e665345f9eef0c621aa0bf2f8d78cf6d21904eef16a93f020240b704a57f1334"}, + {file = "packaging-20.3-py2.py3-none-any.whl", hash = "sha256:82f77b9bee21c1bafbf35a84905d604d5d1223801d639cf3ed140bd651c08752"}, + {file = "packaging-20.3.tar.gz", hash = "sha256:3c292b474fda1671ec57d46d739d072bfd495a4f51ad01a055121d81e952b7a3"}, ] parso = [ {file = "parso-0.6.2-py2.py3-none-any.whl", hash = "sha256:8515fc12cfca6ee3aa59138741fc5624d62340c97e401c74875769948d4f2995"}, @@ -495,8 +495,8 @@ pluggy = [ {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"}, ] prompt-toolkit = [ - {file = 
"prompt_toolkit-3.0.3-py3-none-any.whl", hash = "sha256:c93e53af97f630f12f5f62a3274e79527936ed466f038953dfa379d4941f651a"}, - {file = "prompt_toolkit-3.0.3.tar.gz", hash = "sha256:a402e9bf468b63314e37460b68ba68243d55b2f8c4d0192f85a019af3945050e"}, + {file = "prompt_toolkit-3.0.4-py3-none-any.whl", hash = "sha256:859e1b205b6cf6a51fa57fa34202e45365cf58f8338f0ee9f4e84a4165b37d5b"}, + {file = "prompt_toolkit-3.0.4.tar.gz", hash = "sha256:ebe6b1b08c888b84c50d7f93dee21a09af39860144ff6130aadbd61ae8d29783"}, ] ptyprocess = [ {file = "ptyprocess-0.6.0-py2.py3-none-any.whl", hash = "sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f"}, @@ -507,16 +507,16 @@ py = [ {file = "py-1.8.1.tar.gz", hash = "sha256:5e27081401262157467ad6e7f851b7aa402c5852dbcb3dae06768434de5752aa"}, ] pygments = [ - {file = "Pygments-2.5.2-py2.py3-none-any.whl", hash = "sha256:2a3fe295e54a20164a9df49c75fa58526d3be48e14aceba6d6b1e8ac0bfd6f1b"}, - {file = "Pygments-2.5.2.tar.gz", hash = "sha256:98c8aa5a9f778fcd1026a17361ddaf7330d1b7c62ae97c3bb0ae73e0b9b6b0fe"}, + {file = "Pygments-2.6.1-py3-none-any.whl", hash = "sha256:ff7a40b4860b727ab48fad6360eb351cc1b33cbf9b15a0f689ca5353e9463324"}, + {file = "Pygments-2.6.1.tar.gz", hash = "sha256:647344a061c249a3b74e230c739f434d7ea4d8b1d5f3721bc0f3558049b38f44"}, ] pyparsing = [ {file = "pyparsing-2.4.6-py2.py3-none-any.whl", hash = "sha256:c342dccb5250c08d45fd6f8b4a559613ca603b57498511740e65cd11a2e7dcec"}, {file = "pyparsing-2.4.6.tar.gz", hash = "sha256:4c830582a84fb022400b85429791bc551f1f4871c33f23e44f353119e92f969f"}, ] pytest = [ - {file = "pytest-5.3.5-py3-none-any.whl", hash = "sha256:ff615c761e25eb25df19edddc0b970302d2a9091fbce0e7213298d85fb61fef6"}, - {file = "pytest-5.3.5.tar.gz", hash = "sha256:0d5fe9189a148acc3c3eb2ac8e1ac0742cb7618c084f3d228baaec0c254b318d"}, + {file = "pytest-5.4.1-py3-none-any.whl", hash = "sha256:0e5b30f5cb04e887b91b1ee519fa3d89049595f428c1db76e73bd7f17b09b172"}, + {file = "pytest-5.4.1.tar.gz", hash = 
"sha256:84dde37075b8805f3d1f392cc47e38a0e59518fb46a431cfdaf7cf1ce805f970"}, ] regex = [ {file = "regex-2020.2.20-cp27-cp27m-win32.whl", hash = "sha256:99272d6b6a68c7ae4391908fc15f6b8c9a6c345a46b632d7fdb7ef6c883a2bbb"}, @@ -587,6 +587,6 @@ whoosh = [ {file = "Whoosh-2.7.4.zip", hash = "sha256:e0857375f63e9041e03fedd5b7541f97cf78917ac1b6b06c1fcc9b45375dda69"}, ] zipp = [ - {file = "zipp-3.0.0-py3-none-any.whl", hash = "sha256:12248a63bbdf7548f89cb4c7cda4681e537031eda29c02ea29674bc6854460c2"}, - {file = "zipp-3.0.0.tar.gz", hash = "sha256:7c0f8e91abc0dc07a5068f315c52cb30c66bfbc581e5b50704c8a2f6ebae794a"}, + {file = "zipp-3.1.0-py3-none-any.whl", hash = "sha256:aa36550ff0c0b7ef7fa639055d797116ee891440eac1a56f378e2d3179e0320b"}, + {file = "zipp-3.1.0.tar.gz", hash = "sha256:c599e4d75c98f6798c509911d08a22e6c021d074469042177c8c86fb92eefd96"}, ]