commit 220709fdd46669fb36fdc321dc82f95205fe0b2d
parent e537dcdb4f9686d26396870a0b263d72c4a04519
Author: Natalie Pendragon <natpen@natpen.net>
Date: Sat, 14 Mar 2020 22:50:06 -0400
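Improve crawling and indexing: resolve relative links, strip the default port from URLs, index binary and input-prompt responses, and search the new prompt field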
Diffstat:
M | README.md | | | 22 | ---------------------- |
M | gus/crawl.py | | | 94 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------- |
M | gus/serve.py | | | 13 | +++++++------ |
M | poetry.lock | | | 58 | +++++++++++++++++++++++++++++----------------------------- |
4 files changed, 106 insertions(+), 81 deletions(-)
diff --git a/README.md b/README.md
@@ -30,19 +30,6 @@ as this guide to [mailing list etiquette](https://man.sr.ht/lists.sr.ht/etiquett
- **general code cleanup**: most notably crawl.py. There are a lot
of hacks in there that I put in for expediency, but haven't
taken the time to address.
-- **improve the indexing**: currently, the url is prepended to
- the page content, and everything is simply indexed with the
- default indexer. I think a better solution would be to have
- urls indexed with a url-specific indexer that doesn't do
- things like, e.g., porter-stemming, which I assume the
- default indexer is doing.
-- **extend the index to handle binary links in Geminispace**:
- currently, there's a hack in the code to simply skip
- anything that looks like a binary link. I think with the
- above improvement to how indexing works, they could be
- made very effectively searchable. Also in this vein,
- binary links should be identified via their mime types
- probably, instead of the suffix hack used now.
- **add tests**: there aren't any yet!
- **add functionality to create a mock index**: this would
be useful for local hacking on serve.py, so one does
@@ -55,13 +42,4 @@ as this guide to [mailing list etiquette](https://man.sr.ht/lists.sr.ht/etiquett
solution will become increasingly unappealing as the amount
of content, and thus amount of search hits, in Geminispace
grows).
-- **extend the index to handle query links in Geminispace**:
- currently, there's a hack in the code to simply skip
- query links based on the response status code. These should
- be indexed, probably with their urls and their query prompt
- texts as the indexed content.
-- **allow seedlist additions from the site itself**: this
- would allow anyone to ensure their content gets crawled,
- even if there is currently no path from the current
- seedlist to their content.
- **track freshness of content**
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -2,7 +2,7 @@ import pathlib
import re
import shutil
from urllib import robotparser
-from urllib.parse import urlparse, urlunparse
+from urllib.parse import unquote, urljoin, urlparse, urlunparse, uses_relative, uses_netloc
import gusmobile as gemini
from whoosh.analysis import FancyAnalyzer
@@ -11,6 +11,11 @@ from whoosh.index import create_in
from gus.whoosh_extensions import UrlAnalyzer
+# hack(natpen): the built-in methods in urllib need to know the
+# Gemini protocol exists
+uses_relative.append("gemini")
+uses_netloc.append("gemini")
+
INDEX_DIR = "index"
SEED_URLS = [
@@ -29,6 +34,7 @@ SEED_URLS = [
"gemini://dump.royniang.com",
"gemini://konpeito.media",
"gemini://gemini.68kmentat.com",
+ # "gemini://envs.net",
]
@@ -48,45 +54,74 @@ def create_index(index_dir):
analyzer=FancyAnalyzer(),
spelling=True,
),
+ prompt=TEXT(
+ analyzer=FancyAnalyzer(),
+ stored=True,
+ ),
)
index = create_in("index", schema)
index_writer = index.writer()
return index_writer
-def clean_links(links):
+def clean_links(links, current_url):
clean_links = []
for link in links:
clean_link = link
u = urlparse(link)
- if u.scheme is not None and u.scheme != "gemini":
+ if u.scheme != '' and u.scheme != "gemini":
continue
- if u.port is None:
- clean_link = clean_link.replace(u.hostname, u.hostname+":1965")
+ if u.netloc == '':
+ # relative link
+ clean_link = urljoin(current_url, clean_link)
+ u = urlparse(clean_link)
+ if u.port == 1965:
+ clean_link = clean_link.replace(u.hostname+":1965", u.hostname, 1)
if u.scheme is None:
- clean_link = clean_link.replace(u.hostname, "gemini://"+u.hostname)
+ clean_link = clean_link.replace(u.hostname, "gemini://"+u.hostname, 1)
clean_links.append(clean_link)
return clean_links
def normalize_gemini_url(url):
+ if "%" in url:
+ url = unquote(url)
u = urlparse(url.lower().strip().rstrip('/'), 'gemini')
+ if u.hostname is None:
+ return None
url_normalized = urlunparse(u)
- if u.port is None:
- url_normalized = url_normalized.replace(u.hostname, u.hostname+":1965")
+ if u.port == 1965:
+ url_normalized = url_normalized.replace(u.hostname+":1965", u.hostname, 1)
if u.scheme is None:
- url_normalized = url_normalized.replace(u.hostname, "gemini://"+u.hostname)
+ url_normalized = url_normalized.replace(u.hostname, "gemini://"+u.hostname, 1)
return url_normalized
-def extract_gemini_links(content):
+def extract_gemini_links(content, current_url):
link_pattern = "=>\s(\S+)"
links = re.findall(link_pattern, content)
- gemini_links = clean_links(links)
+ gemini_links = clean_links(links, current_url)
return gemini_links
+def index_binary(response):
+ print("INDEXING BINARY...")
+ index_writer.add_document(
+ url=response.url,
+ content_type=response.content_type,
+ )
+
+
+def index_prompt(response):
+ print("INDEXING PROMPT...")
+ index_writer.add_document(
+ url=response.url,
+ content_type="input",
+ prompt=response.prompt,
+ )
+
+
def index_content(response):
- print("INDEXING...")
+ print("INDEXING CONTENT...")
index_writer.add_document(
url=response.url,
content_type=response.content_type,
@@ -113,12 +148,13 @@ def crawl_url(url):
u = urlparse(url, 'gemini')
url = urlunparse(u)
path = u.path.lower().rstrip().rstrip('/')
- if path.endswith(".mp3") or path.endswith(".png") or path.endswith(".jpg") or path.endswith(".jpeg") or path.endswith(".zip"):
- print("BINARY SKIP : %s" % url)
- print("--------------------------")
- return
normalized_url = normalize_gemini_url(url)
- if normalized_url.startswith("gemini://example.org"):
+ if normalized_url is None or \
+ normalized_url.startswith("gemini://example.org") or \
+ normalized_url.startswith("gemini://example.com") or \
+ normalized_url.startswith("gemini://gemini.conman.org/test"):
+ print("MANUAL EXCLUSION SKIP : %s" % url)
+ print("--------------------------")
return
robots_file = get_robots_file(normalized_url)
if robots_file is not None:
@@ -127,7 +163,6 @@ def crawl_url(url):
print("ROBOTS SKIP : %s" % url)
print("--------------------------")
return
-
if normalized_url in visited_urls:
print("ALREADY SEEN : %s" % url)
print("--------------------------")
@@ -142,18 +177,29 @@ def crawl_url(url):
elif r.status.startswith("3"):
# redirect status
print("REDIRECT : %s -> %s" % (url, r.url))
+ visited_urls.pop()
crawl_url(r.url)
+ elif r.status.startswith("1"):
+ # input status
+ print("URL : %s" % r.url)
+ print("STATUS : %s" % r.status)
+ print("PROMPT : %s" % r.prompt)
+ index_prompt(r)
+ print("--------------------------")
elif r.status.startswith("2"):
# success status
print("URL : %s" % r.url)
print("STATUS : %s" % r.status)
- print("STATUS META : %s" % r.status_meta)
print("CONTENT TYPE : %s" % r.content_type)
- index_content(r)
- print("--------------------------")
- gemini_links = extract_gemini_links(r.content)
- for link in gemini_links:
- crawl_url(link)
+ if r.content_type.startswith("text/"):
+ index_content(r)
+ print("--------------------------")
+ gemini_links = extract_gemini_links(r.content, r.url)
+ for link in gemini_links:
+ crawl_url(link)
+ else:
+ index_binary(r)
+ print("--------------------------")
else:
# input, error, etc (all other statuses)
print("UNHANDLED : %s" % url)
diff --git a/gus/serve.py b/gus/serve.py
@@ -27,7 +27,7 @@ def _render_header():
def _render_footer():
return [
"",
- "=> /add-seed See any missing results? Add a gemini URL to the index here."
+ "=> /add-seed See any missing results? Let GUS know your gemini URL exists."
]
@@ -66,7 +66,7 @@ def index(request):
def _search_index(query):
- query = MultifieldParser(["content", "url"], ix.schema).parse(query)
+ query = MultifieldParser(["content", "url", "prompt"], ix.schema).parse(query)
results = searcher.search(query)
return (
len(results),
@@ -98,17 +98,18 @@ def _render_results(results):
for i, result in enumerate(results):
if i > 0:
data.append("")
+ prompt_suffix = ""
+ if result[2] == "input":
+ prompt_suffix = ": {}".format(result[4])
data.append("=> {}".format(result[1]))
- data.append("{} (score: {:.2f})".format(result[2], result[0]))
+ data.append("{}{} (score: {:.2f})".format(result[2], prompt_suffix, result[0]))
return data
def _render_results_header(query, num_results):
return [
"",
- "| \"{}\" - {} hits".format(query, num_results),
- # "| {} hits".format(num_results),
- # "===========================",
+ "> \"{}\" - {} hits".format(query, num_results),
""
]
diff --git a/poetry.lock b/poetry.lock
@@ -71,8 +71,8 @@ category = "dev"
description = "Composable command line interface toolkit"
name = "click"
optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
-version = "7.0"
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+version = "7.1.1"
[[package]]
category = "dev"
@@ -89,7 +89,7 @@ description = "Decorators for Humans"
name = "decorator"
optional = false
python-versions = ">=2.6, !=3.0.*, !=3.1.*"
-version = "4.4.1"
+version = "4.4.2"
[[package]]
category = "main"
@@ -100,7 +100,7 @@ python-versions = "*"
version = "0.1.0"
[package.source]
-reference = "72e639c07d06c48c5b545d238e35e406c1aece89"
+reference = "123c73b4e06c89781543dfcba55581d6a3931129"
type = "git"
url = "https://git.sr.ht/~natpen/gusmobile"
[[package]]
@@ -125,7 +125,7 @@ description = "IPython: Productive Interactive Computing"
name = "ipython"
optional = false
python-versions = ">=3.6"
-version = "7.12.0"
+version = "7.13.0"
[package.dependencies]
appnope = "*"
@@ -141,7 +141,7 @@ setuptools = ">=18.5"
traitlets = ">=4.2"
[package.extras]
-all = ["ipyparallel", "requests", "notebook", "qtconsole", "ipywidgets", "pygments", "nbconvert", "testpath", "Sphinx (>=1.3)", "nbformat", "numpy (>=1.14)", "ipykernel", "nose (>=0.10.1)"]
+all = ["numpy (>=1.14)", "testpath", "notebook", "nose (>=0.10.1)", "nbconvert", "requests", "ipywidgets", "qtconsole", "ipyparallel", "Sphinx (>=1.3)", "pygments", "nbformat", "ipykernel"]
doc = ["Sphinx (>=1.3)"]
kernel = ["ipykernel"]
nbconvert = ["nbconvert"]
@@ -196,7 +196,7 @@ description = "Core utilities for Python packages"
name = "packaging"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
-version = "20.1"
+version = "20.3"
[package.dependencies]
pyparsing = ">=2.0.2"
@@ -262,8 +262,8 @@ category = "dev"
description = "Library for building powerful interactive command lines in Python"
name = "prompt-toolkit"
optional = false
-python-versions = ">=3.6"
-version = "3.0.3"
+python-versions = ">=3.6.1"
+version = "3.0.4"
[package.dependencies]
wcwidth = "*"
@@ -290,8 +290,8 @@ category = "dev"
description = "Pygments is a syntax highlighting package written in Python."
name = "pygments"
optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
-version = "2.5.2"
+python-versions = ">=3.5"
+version = "2.6.1"
[[package]]
category = "dev"
@@ -307,7 +307,7 @@ description = "pytest: simple powerful testing with Python"
name = "pytest"
optional = false
python-versions = ">=3.5"
-version = "5.3.5"
+version = "5.4.1"
[package.dependencies]
atomicwrites = ">=1.0"
@@ -398,7 +398,7 @@ marker = "python_version < \"3.8\""
name = "zipp"
optional = false
python-versions = ">=3.6"
-version = "3.0.0"
+version = "3.1.0"
[package.extras]
docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"]
@@ -434,16 +434,16 @@ black = [
{file = "black-19.10b0.tar.gz", hash = "sha256:c2edb73a08e9e0e6f65a0e6af18b059b8b1cdd5bef997d7a0b181df93dc81539"},
]
click = [
- {file = "Click-7.0-py2.py3-none-any.whl", hash = "sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13"},
- {file = "Click-7.0.tar.gz", hash = "sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7"},
+ {file = "click-7.1.1-py2.py3-none-any.whl", hash = "sha256:e345d143d80bf5ee7534056164e5e112ea5e22716bbb1ce727941f4c8b471b9a"},
+ {file = "click-7.1.1.tar.gz", hash = "sha256:8a18b4ea89d8820c5d0c7da8a64b2c324b4dabb695804dbfea19b9be9d88c0cc"},
]
colorama = [
{file = "colorama-0.4.3-py2.py3-none-any.whl", hash = "sha256:7d73d2a99753107a36ac6b455ee49046802e59d9d076ef8e47b61499fa29afff"},
{file = "colorama-0.4.3.tar.gz", hash = "sha256:e96da0d330793e2cb9485e9ddfd918d456036c7149416295932478192f4436a1"},
]
decorator = [
- {file = "decorator-4.4.1-py2.py3-none-any.whl", hash = "sha256:5d19b92a3c8f7f101c8dd86afd86b0f061a8ce4540ab8cd401fa2542756bce6d"},
- {file = "decorator-4.4.1.tar.gz", hash = "sha256:54c38050039232e1db4ad7375cfce6748d7b41c29e95a081c8a6d2c30364a2ce"},
+ {file = "decorator-4.4.2-py2.py3-none-any.whl", hash = "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760"},
+ {file = "decorator-4.4.2.tar.gz", hash = "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"},
]
gusmobile = []
importlib-metadata = [
@@ -451,8 +451,8 @@ importlib-metadata = [
{file = "importlib_metadata-1.5.0.tar.gz", hash = "sha256:06f5b3a99029c7134207dd882428a66992a9de2bef7c2b699b5641f9886c3302"},
]
ipython = [
- {file = "ipython-7.12.0-py3-none-any.whl", hash = "sha256:f6689108b1734501d3b59c84427259fd5ac5141afe2e846cfa8598eb811886c9"},
- {file = "ipython-7.12.0.tar.gz", hash = "sha256:d9459e7237e2e5858738ff9c3e26504b79899b58a6d49e574d352493d80684c6"},
+ {file = "ipython-7.13.0-py3-none-any.whl", hash = "sha256:eb8d075de37f678424527b5ef6ea23f7b80240ca031c2dd6de5879d687a65333"},
+ {file = "ipython-7.13.0.tar.gz", hash = "sha256:ca478e52ae1f88da0102360e57e528b92f3ae4316aabac80a2cd7f7ab2efb48a"},
]
ipython-genutils = [
{file = "ipython_genutils-0.2.0-py2.py3-none-any.whl", hash = "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8"},
@@ -471,8 +471,8 @@ more-itertools = [
{file = "more_itertools-8.2.0-py3-none-any.whl", hash = "sha256:5dd8bcf33e5f9513ffa06d5ad33d78f31e1931ac9a18f33d37e77a180d393a7c"},
]
packaging = [
- {file = "packaging-20.1-py2.py3-none-any.whl", hash = "sha256:170748228214b70b672c581a3dd610ee51f733018650740e98c7df862a583f73"},
- {file = "packaging-20.1.tar.gz", hash = "sha256:e665345f9eef0c621aa0bf2f8d78cf6d21904eef16a93f020240b704a57f1334"},
+ {file = "packaging-20.3-py2.py3-none-any.whl", hash = "sha256:82f77b9bee21c1bafbf35a84905d604d5d1223801d639cf3ed140bd651c08752"},
+ {file = "packaging-20.3.tar.gz", hash = "sha256:3c292b474fda1671ec57d46d739d072bfd495a4f51ad01a055121d81e952b7a3"},
]
parso = [
{file = "parso-0.6.2-py2.py3-none-any.whl", hash = "sha256:8515fc12cfca6ee3aa59138741fc5624d62340c97e401c74875769948d4f2995"},
@@ -495,8 +495,8 @@ pluggy = [
{file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"},
]
prompt-toolkit = [
- {file = "prompt_toolkit-3.0.3-py3-none-any.whl", hash = "sha256:c93e53af97f630f12f5f62a3274e79527936ed466f038953dfa379d4941f651a"},
- {file = "prompt_toolkit-3.0.3.tar.gz", hash = "sha256:a402e9bf468b63314e37460b68ba68243d55b2f8c4d0192f85a019af3945050e"},
+ {file = "prompt_toolkit-3.0.4-py3-none-any.whl", hash = "sha256:859e1b205b6cf6a51fa57fa34202e45365cf58f8338f0ee9f4e84a4165b37d5b"},
+ {file = "prompt_toolkit-3.0.4.tar.gz", hash = "sha256:ebe6b1b08c888b84c50d7f93dee21a09af39860144ff6130aadbd61ae8d29783"},
]
ptyprocess = [
{file = "ptyprocess-0.6.0-py2.py3-none-any.whl", hash = "sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f"},
@@ -507,16 +507,16 @@ py = [
{file = "py-1.8.1.tar.gz", hash = "sha256:5e27081401262157467ad6e7f851b7aa402c5852dbcb3dae06768434de5752aa"},
]
pygments = [
- {file = "Pygments-2.5.2-py2.py3-none-any.whl", hash = "sha256:2a3fe295e54a20164a9df49c75fa58526d3be48e14aceba6d6b1e8ac0bfd6f1b"},
- {file = "Pygments-2.5.2.tar.gz", hash = "sha256:98c8aa5a9f778fcd1026a17361ddaf7330d1b7c62ae97c3bb0ae73e0b9b6b0fe"},
+ {file = "Pygments-2.6.1-py3-none-any.whl", hash = "sha256:ff7a40b4860b727ab48fad6360eb351cc1b33cbf9b15a0f689ca5353e9463324"},
+ {file = "Pygments-2.6.1.tar.gz", hash = "sha256:647344a061c249a3b74e230c739f434d7ea4d8b1d5f3721bc0f3558049b38f44"},
]
pyparsing = [
{file = "pyparsing-2.4.6-py2.py3-none-any.whl", hash = "sha256:c342dccb5250c08d45fd6f8b4a559613ca603b57498511740e65cd11a2e7dcec"},
{file = "pyparsing-2.4.6.tar.gz", hash = "sha256:4c830582a84fb022400b85429791bc551f1f4871c33f23e44f353119e92f969f"},
]
pytest = [
- {file = "pytest-5.3.5-py3-none-any.whl", hash = "sha256:ff615c761e25eb25df19edddc0b970302d2a9091fbce0e7213298d85fb61fef6"},
- {file = "pytest-5.3.5.tar.gz", hash = "sha256:0d5fe9189a148acc3c3eb2ac8e1ac0742cb7618c084f3d228baaec0c254b318d"},
+ {file = "pytest-5.4.1-py3-none-any.whl", hash = "sha256:0e5b30f5cb04e887b91b1ee519fa3d89049595f428c1db76e73bd7f17b09b172"},
+ {file = "pytest-5.4.1.tar.gz", hash = "sha256:84dde37075b8805f3d1f392cc47e38a0e59518fb46a431cfdaf7cf1ce805f970"},
]
regex = [
{file = "regex-2020.2.20-cp27-cp27m-win32.whl", hash = "sha256:99272d6b6a68c7ae4391908fc15f6b8c9a6c345a46b632d7fdb7ef6c883a2bbb"},
@@ -587,6 +587,6 @@ whoosh = [
{file = "Whoosh-2.7.4.zip", hash = "sha256:e0857375f63e9041e03fedd5b7541f97cf78917ac1b6b06c1fcc9b45375dda69"},
]
zipp = [
- {file = "zipp-3.0.0-py3-none-any.whl", hash = "sha256:12248a63bbdf7548f89cb4c7cda4681e537031eda29c02ea29674bc6854460c2"},
- {file = "zipp-3.0.0.tar.gz", hash = "sha256:7c0f8e91abc0dc07a5068f315c52cb30c66bfbc581e5b50704c8a2f6ebae794a"},
+ {file = "zipp-3.1.0-py3-none-any.whl", hash = "sha256:aa36550ff0c0b7ef7fa639055d797116ee891440eac1a56f378e2d3179e0320b"},
+ {file = "zipp-3.1.0.tar.gz", hash = "sha256:c599e4d75c98f6798c509911d08a22e6c021d074469042177c8c86fb92eefd96"},
]