geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit a3dd4578b051bf55728b9c250844287992b43792
parent e0ea8f1de508923e07d916898ada64527bae3e0d
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Thu, 18 Jun 2020 06:58:28 -0400

[serve] Rearchitect serve to use templates and MVC pattern

Diffstat:
M gus/lib/index_statistics.py | 2 +-
D gus/serve.py | 489 -------------------------------------------------------------------------------
M poetry.lock | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
M pyproject.toml | 11 ++++++-----
A serve/__init__.py | 3 +++
A serve/constants.py | 3 +++
A serve/main.py | 26 ++++++++++++++++++++++++++
A serve/models.py | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A serve/templates/about.gmi | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A serve/templates/add_seed.gmi | 4 ++++
A serve/templates/fragments/footer.gmi | 3 +++
A serve/templates/fragments/header.gmi | 4 ++++
A serve/templates/fragments/pager.gmi | 10 ++++++++++
A serve/templates/index.gmi | 9 +++++++++
A serve/templates/known_hosts.gmi | 12 ++++++++++++
A serve/templates/news.gmi | 40 ++++++++++++++++++++++++++++++++++++++++
A serve/templates/search.gmi | 28 ++++++++++++++++++++++++++++
A serve/templates/search_suggestions.gmi | 12 ++++++++++++
A serve/templates/statistics.gmi | 35 +++++++++++++++++++++++++++++++++++
A serve/views.py | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
20 files changed, 543 insertions(+), 496 deletions(-)

diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py @@ -123,7 +123,7 @@ def load_last_statistics_from_file(filename): def deserialize_statistics_line(line): line_parts = line.split(",") - index_modification_time = line_parts[0] + index_modification_time = datetime.strptime(line_parts[0], "%Y-%m-%d") # discard line_parts[1], which is `was_destructive` page_count = line_parts[2] domain_count = line_parts[3] diff --git a/gus/serve.py b/gus/serve.py @@ -1,489 +0,0 @@ -""" -A gemini search engine frontend. -""" -import asyncio -from datetime import datetime -import math -import os -import re -from subprocess import call -import sys -import threading -from urllib.parse import quote - -import jetforce -from jetforce import Response, Status -from whoosh import highlight, qparser -from whoosh.index import open_dir - -from gus.crawl import run_crawl -from gus.lib.index_statistics import compute_index_statistics, load_last_statistics_from_file -from gus.lib.misc import bytes2human -from gus.lib.whoosh_extensions import GeminiFormatter, GeminiScorer - -INDEX_DIR = "index" -app = jetforce.JetforceApplication() -gemini_highlighter = highlight.Highlighter( - formatter=GeminiFormatter(), - fragmenter=highlight.ContextFragmenter(maxchars=160, surround=80), - scorer=GeminiScorer(), - order=highlight.SCORE, -) -crawl_thread_lock = threading.Lock() - -def load_and_compute_statistics(filename): - statistics = load_last_statistics_from_file(filename) - - # we want fresh data for the below figures, and they aren't persisted to file - # during non-destructive crawls, so recompute them! 
- index_statistics = compute_index_statistics(INDEX_DIR) - statistics["index_modification_time"] = index_statistics["index_modification_time"] - statistics["page_count"] = index_statistics["page_count"] - statistics["domain_count"] = index_statistics["domain_count"] - statistics["content_type_frequencies"] = index_statistics["content_type_frequencies"] - statistics["charset_frequencies"] = index_statistics["charset_frequencies"] - statistics["domains"] = index_statistics["domains"] - return statistics - - -last_statistics = load_and_compute_statistics("statistics.csv") - -def _render_header(): - return [ - "# GUS - Gemini Universal Search", - "", - "=> / Home", - "=> /search Search GUS" - ] - - -def _render_footer(): - return [ - "", - "=> /add-seed See any missing results? Let GUS know your gemini URL exists.", - "", - "Index updated on: {:%Y-%m-%d}".format(index_modification_time) - ] - - -def _render_index_statistics(): - d = [ - "", - "# GUS Statistics", - "", - "## Overall", - "", - "These figures are representative of the aggregate size of Geminispace at the time the current index was last updated on {:%Y-%m-%d}.".format(last_statistics["index_modification_time"]), - "", - "```", - "Page Count : {:>5}".format(last_statistics["page_count"]), - "Domain Count : {:>5}".format(last_statistics["domain_count"]), - "```", - "", - "## By Content Type", - "", - "These figures are representative of the number of pages seen per content type at the time the current index was last updated on {:%Y-%m-%d}.".format(last_statistics["index_modification_time"]), - "", - "```", - ] - for pair in last_statistics["content_type_frequencies"]: - d.append("{:>5} - {}".format(pair[1], pair[0])) - d.extend([ - "```", - "", - "## By Charset", - "", - "These figures are representative of the number of pages seen per content type at the time the current index was last updated on {:%Y-%m-%d}.".format(last_statistics["index_modification_time"]), - "", - "```", - ]) - for pair in 
last_statistics["charset_frequencies"]: - d.append("{:>5} - {}".format(pair[1], pair[0])) - d.append("```") - return d - - -def _render_known_hosts(): - d = [ - "", - "# Known Gemini Hosts", - "", - "Below are the hosts, or servers, in Geminispace, of which GUS is aware. Note that this list is auto-generated from the index, so if your host is not showing up here, it also won't have its content represented in GUS search results! If your server is missing, try the link at the bottom of this page to request an immediate crawl of a Gemini URL, after which your server should start showing up.", - "", - ] - # TODO: remove this `sorted` call after the next index generation - for domain in sorted(last_statistics["domains"]): - d.append("=> gemini://{} {}".format(domain, domain)) - return d - - -def _render_news(): - d = [ - "", - "# GUS News", - ] - news_items = [ - { - "date": "2020-06-12", - "content": "Added size of pages to each result, so users can see how much network bandwidth would be involved in following each link. Size is also provided as a valid query search filter now. Documentation for filtering by size can be found on the advanced searching section of the about page!", - }, - { - "date": "2020-06-03", - "content": "Added ability to search and filter by charset. Documentation for this feature can be found on the advanced searching section of the about page!", - }, - { - "date": "2020-05-21", - "content": "Added ability to search and filter by domain. Documentation for this feature can be found on the advanced searching section of the about page!", - }, - { - "date": "2020-05-21", - "content": "Added contextual highlights from result pages' content directly to GUS search results pages. This should give a nice preview of what to expect if you click through to the result!", - }, - { - "date": "2020-05-19", - "content": "Added instantaneous indexing of seed requests! 
Submit a seed request and your site should begin showing up in closer to a few minutes than a few days, as it tended to before.", - }, - { - "date": "2020-05-16", - "content": "Added verbose search result mode. This is meant as an aid to content creators in figuring out exactly when GUS crawled specific pages, and how those pages' content scores against other results in Geminispace. Documentation for this feature is available under the advanced searching section of the about page.", - }, - { - "date": "2020-05-14", - "content": "Added paging functionality, which will now show up at the bottom of search result pages that have more than one page of results.", - }, - { - "date": "2020-05-12", - "content": "Added Known Hosts and News, both of which you can find on the GUS homepage.", - }, - { - "date": "2020-05-11", - "content": "Added Index Statistics feature, which you can find on the GUS homepage.", - }, - { - "date": "2020-05-09", - "content": "Added ability to search and filter by content type. Documentation for this feature can be found on the about page! I'm really excited about this update, because it will allow users to do things like search for all the music in Geminispace, or search for all the downloadable epub content. Unsurprisingly, most of the current content is either `text/gemini` or `text/plain`, but there is still a smattering of other interesting content types already in existence in Geminispace.", - }, - { - "date": "2020-03-04", - "content": "Added search suggestions. When a search returns with no results, GUS will now attempt to find some lexicographically similar search suggestions that would yield results. Hopefully this will help ameliorate both typos in queries, as well as the slight paucity of content in these early days of Gemini.", - }, - { - "date": "2020-02-21", - "content": "Updated GUS indexing to respect robots.txt. 
Documentation for this feature can be found on the about page.", - }, - ] - news_items.sort(key=lambda item: item["date"], reverse=True) - for item in news_items: - d.extend(["", "## {}".format(item["date"]), item["content"]]) - return d - - -@app.route("") -def index(request): - data = _render_header() - data.extend([ - "=> /about About GUS", - "=> /statistics GUS Statistics", - "=> /known-hosts Known Gemini Hosts", - "=> /news GUS News", - "=> gemini://gemini.circumlunar.space Gemini Project information", - ]) - data.extend(_render_footer()) - return Response(Status.SUCCESS, "text/gemini", "\n".join(data)) - - -@app.route("/about") -def index(request): - data = _render_header() - data.extend([ - "", - "# About GUS", - "", - "GUS is a search engine for all content served over the Gemini Protocol. It can help you track down textual pages (e.g., `text/gemini`, `text/plain`, `text/markdown`) with content containing your search terms, but it can just as easily help you track down binary files (e.g., images, mp3s) which happen to be served over the Gemini protocol. GUS will only index content within Geminispace, and will not index links out to other protocols, like Http or Gopher.", - "", - "To control crawling of your site, you can use a robots.txt file, Place it in your document root directory such that a request for \"robots.txt\" will fetch it.", - "", - "GUS obeys User-agent of \"gus\" and \"*\".", - "", - "If you have questions about or ideas for GUS, please email me at natpen@natpen.net.", - "", - "# Advanced Searching", - "", - "## Filters", - "", - "To improve the quality of your search results, you can apply filters to constrain your search results in various dimensions. The currently implemented filters are:", - "* content_type", - "* domain", - "* charset", - "* size", - "", - "To filter by one of these, simply add it to your query followed by a colon, and the value you wish to filter by. 
Some examples of doing so follow.", - "", - "=> /search?content_type%3Aapplication/pdf application/pdf", - "=> /search?content_type%3Aaudio audio", - "=> /search?content_type%3Aimage/jpeg image/jpeg", - "=> /search?content_type%3Ainput input", - "", - "=> /search?domain%3Acircumlunar domain:circumlunar", - "=> /search?contextual%20domain%3Agus contextual domain:gus", - "", - "=> /search?computers%20content_type%3Agemini%20AND%20NOT%20charset%3AUS-ASCII computers content_type:gemini AND NOT charset:US-ASCII", - "=> /search?NOT%20charset%3Anone NOT charset:none", - "", - "Note that size works slightly different than the other filters, as it is numeric. Typically, you will want to limit your search results to those less than, or greater than, a certain size.", - "", - "=> /search?computer%20AND%20size%3A%3E2000 computer AND size:>2000", - "", - "For further inspiration on how to use these filters, you can visit both GUS' list of known hosts, as well as GUS' list of known content_types and charsets on the statistics page. Note that there is some nuance to the charset values, due to the fact that specifying them is optional, and if one does not specify, there is a default of utf-8 - pages that do not specify a charset have an indexed charset value of \"none\".", - "", - "=> /known-hosts GUS Known Hosts (with list of domains)", - "=> /statistics GUS statistics (with list of content_types)", - "", - "## Verbose Mode", - "", - "To allow greater insight into both how pages are ranking against each other, as well as when GUS crawled their content, you can enable verbose mode on any search results page. This will show the numerical score of each search result for the given query, as well as the exact time that page was crawled.", - "", - "To enable verbose mode for a given search result page, you should add a new \"v\" path component to the URL preceding the \"search\" component. 
Below is an example:", - "", - "=> gemini://gus.guru/search?test", - "=> gemini://gus.guru/v/search?test", - "", - "Note that verbose mode is not sticky, so navigating away from a verbose search result page will return you to non-verbose mode.", - ]) - data.extend(_render_footer()) - return Response(Status.SUCCESS, "text/gemini", "\n".join(data)) - - -@app.route("/statistics") -def index(request): - data = _render_header() - data.extend(_render_index_statistics()) - data.extend(_render_footer()) - return Response(Status.SUCCESS, "text/gemini", "\n".join(data)) - - -@app.route("/known-hosts") -def index(request): - data = _render_header() - data.extend(_render_known_hosts()) - data.extend(_render_footer()) - return Response(Status.SUCCESS, "text/gemini", "\n".join(data)) - - -@app.route("/news") -def index(request): - data = _render_header() - data.extend(_render_news()) - data.extend(_render_footer()) - return Response(Status.SUCCESS, "text/gemini", "\n".join(data)) - - -def _search_index(query, requested_page): - query = query_parser.parse(query) - results = searcher.search_page(query, requested_page, pagelen=10) - return ( - len(results), - [{ - "score" : result.score, - "indexed_at" : result["indexed_at"], - "url" : result["url"], - "fetchable_url": result["fetchable_url"], - "content_type" : result["content_type"], - "charset" : result["charset"] if "charset" in result else "none", - "size" : result["size"] if "size" in result else 0, - "prompt" : result["prompt"] if "prompt" in result else "", - "highlights" : gemini_highlighter.highlight_hit(result, "content", top=1) if "content" in result and result["content_type"] in ["text/plain", "text/gemini", "text/markdown"] else "", - } for result in results - ] - ) - - -def _get_index_modification_time(): - return datetime.fromtimestamp(os.path.getmtime("index")) - - -def _get_search_suggestions(query): - suggestions = [] - corrector = searcher.corrector("content") - for query_part in query.split(" "): - 
suggestions.extend(corrector.suggest(query_part, limit=3)) - return suggestions - -def _render_search_suggestions(suggestions): - data = [] - if len(suggestions) == 0: - data.append("No results!") - return data - data.append("Search suggestions:") - for suggestion in suggestions: - data.append("=> /search?{} GUS: {}".format(suggestion, suggestion)) - return data - - -def _render_results(results, verbose=False): - data = [] - for i, result in enumerate(results): - if i > 0: - data.append("") - - if result["content_type"] == "input": - prompt_suffix = ": {}".format(result["prompt"]) - link_text = "{} ({}{})".format(result["url"][9:], result["content_type"], prompt_suffix) - else: - link_text = "{} ({}, {})".format( - result["url"][9:], result["content_type"], - bytes2human(result["size"], format="%(value).0f%(symbol)s") - ) - data.append("=> {} {}".format(result["fetchable_url"], link_text)) - if verbose: - data.append("* Score : {:.2f}".format(result["score"])) - data.append("* Indexed at : {:%Y-%m-%d %H:%M}".format(result["indexed_at"])) - data.append("* Charset : {}".format(result["charset"])) - if len(result["highlights"]) > 0: - data.extend(result["highlights"].split(GeminiFormatter.between)) - return data - - -def _render_results_header(query, verbose=False): - data = [ - "", - ] - if verbose: - data.append("# Search GUS [VERBOSE]") - else: - data.append("# Search GUS") - data.extend([ - "", - "## \"{}\"".format(query), - "" - ]) - return data - - -def _render_results_footer(num_results, requested_page, query): - data = [ - "", - ] - num_pages = math.ceil(num_results / 10) - current_page = min(requested_page, num_pages) - if num_results == 0: - current_page = 0 - data.append("Page {} of {} ({} results)".format(current_page, num_pages, num_results)) - if current_page > 2: - data.append("=> /search?{} First Page".format(quote(query))) - if current_page > 1: - data.append("=> /search/{}?{} Previous Page".format(current_page - 1, quote(query))) - if current_page < 
num_pages: - data.append("=> /search/{}?{} Next Page".format(current_page + 1, quote(query))) - return data - - -def compute_requested_results_page(request_path): - page = 1 - p = re.compile("^(/v)?/search(/\d+)?/?") - m = p.match(request_path) - if m.group(2) is not None: - page = int(m.group(2)[1:]) - return max(page, 1) - - -def compute_verbose(request_path): - verbose = False - p = re.compile("^(/v)?/search(/\d+)?/?") - m = p.match(request_path) - if m.group(1) is not None: - verbose = True - return verbose - - -@app.route("(/v)?/search(/\d+)?") -def search(request): - data = _render_header() - if request.query: - verbose = compute_verbose(request.path) - requested_page = compute_requested_results_page(request.path) - num_results, results = _search_index(request.query, requested_page) - data.extend(_render_results_header(request.query, verbose)) - if num_results > 0: - data.extend(_render_results(results, verbose)) - else: - search_suggestions = _get_search_suggestions(request.query) - data.extend(_render_search_suggestions(search_suggestions)) - - data.extend(_render_results_footer(num_results, requested_page, request.query)) - data.extend(_render_footer()) - - return Response(Status.SUCCESS, "text/gemini", "\n".join(data)) - else: - return Response(Status.INPUT, "Search query") - - -def crawl_seed_and_restart(seed_url): - # NB: this lock will never get released under normal conditions, as the - # expected conclusion of the incremental crawl thread is issue a call - # to systemctl to restart the entire GUS serve process. That new process - # will reinitialize everything, including a fresh, unlocked Lock object. - # However, if the incremental crawl thread crashes for some reason, it - # should catch the exception and release the lock, so new seed requests - # can kick off their own incremental crawls. 
- with crawl_thread_lock: - run_crawl(should_run_destructive=False, seed_urls=[seed_url]) - call(["sudo", "systemctl", "restart", "gus.service"]) - - -@app.route("/add-seed") -def add_seed(request): - data = _render_header() - if request.query: - with open("seed-requests.txt", "a") as seed_file: - seed_file.write("{}\n".format(request.query)) - # crawl_thread = threading.Thread(name="crawl_thread", - # target=crawl_seed_and_restart, - # args=(request.query,)) - # crawl_thread.start() - data.extend([ - "", - # "Thank you for the addition! GUS is crawling and indexing this URL ({}) now, and it will show up in GUS search results as soon as the indexing completes.".format(request.query), - "Thank you for the addition! GUS usually completes indexing new sites within 24 hours, at which point you should see yours ({}) in GUS search results!".format(request.query), - "", - ]) - return Response(Status.SUCCESS, "text/gemini", "\n".join(data)) - else: - return Response(Status.INPUT, "Gemini URL") - - -def main(): - args = jetforce.command_line_parser().parse_args() - ssl_context = jetforce.make_ssl_context( - args.hostname, args.certfile, args.keyfile, args.cafile, args.capath - ) - server = jetforce.GeminiServer( - host=args.host, - port=args.port, - ssl_context=ssl_context, - hostname=args.hostname, - app=app, - ) - - global ix - ix = open_dir(INDEX_DIR) - - global query_parser - or_group = qparser.OrGroup.factory(0.9) - query_parser = qparser.MultifieldParser(["content", "url", "prompt"], ix.schema, group=or_group) - query_parser.add_plugin(qparser.RegexPlugin()) - query_parser.add_plugin(qparser.GtLtPlugin()) - query_parser.remove_plugin_class(qparser.WildcardPlugin) - query_parser.remove_plugin_class(qparser.BoostPlugin) - query_parser.remove_plugin_class(qparser.RangePlugin) - - global index_modification_time - index_modification_time = _get_index_modification_time() - - global searcher - with ix.searcher() as searcher: - asyncio.run(server.run()) - - -if __name__ == 
"__main__": - main() diff --git a/poetry.lock b/poetry.lock @@ -103,6 +103,7 @@ version = "0.1.0" reference = "47e40dcabd58ec0bf6347b1285d0a846af86f3aa" type = "git" url = "https://git.sr.ht/~natpen/gusmobile" + [[package]] category = "dev" description = "Read metadata from Python packages" @@ -183,6 +184,28 @@ python-versions = ">=3.7" version = "0.2.3" [[package]] +category = "main" +description = "A very fast and expressive template engine." +name = "jinja2" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +version = "2.11.2" + +[package.dependencies] +MarkupSafe = ">=0.23" + +[package.extras] +i18n = ["Babel (>=0.8)"] + +[[package]] +category = "main" +description = "Safely add untrusted strings to HTML/XML markup." +name = "markupsafe" +optional = false +python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*" +version = "1.1.1" + +[[package]] category = "dev" description = "More routines for operating on iterables, beyond itertools" name = "more-itertools" @@ -405,7 +428,7 @@ docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"] testing = ["jaraco.itertools", "func-timeout"] [metadata] -content-hash = "11817402e47724214a3b96093c446f2b08571f838e35c149f037cb6c449d61c2" +content-hash = "f8438379b319f686f48f3b66cabd352c8356a59eadca1630b5c05cd42dfab04d" python-versions = "^3.7" [metadata.files] @@ -466,6 +489,45 @@ jetforce = [ {file = "Jetforce-0.2.3-py3-none-any.whl", hash = "sha256:10da5c53472dc979166982c5bd7406dcbdbcbd0716191370dcf59477faae2368"}, {file = "Jetforce-0.2.3.tar.gz", hash = "sha256:5a5e82a6aa2fb5465a40f95d4666d86cbdcaa6833dbc94cec06527faf347f24f"}, ] +jinja2 = [ + {file = "Jinja2-2.11.2-py2.py3-none-any.whl", hash = "sha256:f0a4641d3cf955324a89c04f3d94663aa4d638abe8f733ecd3582848e1c37035"}, + {file = "Jinja2-2.11.2.tar.gz", hash = "sha256:89aab215427ef59c34ad58735269eb58b1a5808103067f7bb9d5836c651b3bb0"}, +] +markupsafe = [ + {file = "MarkupSafe-1.1.1-cp27-cp27m-macosx_10_6_intel.whl", hash = 
"sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161"}, + {file = "MarkupSafe-1.1.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7"}, + {file = "MarkupSafe-1.1.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183"}, + {file = "MarkupSafe-1.1.1-cp27-cp27m-win32.whl", hash = "sha256:b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b"}, + {file = "MarkupSafe-1.1.1-cp27-cp27m-win_amd64.whl", hash = "sha256:98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e"}, + {file = "MarkupSafe-1.1.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f"}, + {file = "MarkupSafe-1.1.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1"}, + {file = "MarkupSafe-1.1.1-cp34-cp34m-macosx_10_6_intel.whl", hash = "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5"}, + {file = "MarkupSafe-1.1.1-cp34-cp34m-manylinux1_i686.whl", hash = "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1"}, + {file = "MarkupSafe-1.1.1-cp34-cp34m-manylinux1_x86_64.whl", hash = "sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735"}, + {file = "MarkupSafe-1.1.1-cp34-cp34m-win32.whl", hash = "sha256:ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21"}, + {file = "MarkupSafe-1.1.1-cp34-cp34m-win_amd64.whl", hash = "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235"}, + {file = "MarkupSafe-1.1.1-cp35-cp35m-macosx_10_6_intel.whl", hash = "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b"}, + {file = "MarkupSafe-1.1.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f"}, + {file = "MarkupSafe-1.1.1-cp35-cp35m-manylinux1_x86_64.whl", hash = 
"sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905"}, + {file = "MarkupSafe-1.1.1-cp35-cp35m-win32.whl", hash = "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1"}, + {file = "MarkupSafe-1.1.1-cp35-cp35m-win_amd64.whl", hash = "sha256:9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d"}, + {file = "MarkupSafe-1.1.1-cp36-cp36m-macosx_10_6_intel.whl", hash = "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff"}, + {file = "MarkupSafe-1.1.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473"}, + {file = "MarkupSafe-1.1.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e"}, + {file = "MarkupSafe-1.1.1-cp36-cp36m-win32.whl", hash = "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66"}, + {file = "MarkupSafe-1.1.1-cp36-cp36m-win_amd64.whl", hash = "sha256:b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5"}, + {file = "MarkupSafe-1.1.1-cp37-cp37m-macosx_10_6_intel.whl", hash = "sha256:8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d"}, + {file = "MarkupSafe-1.1.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e"}, + {file = "MarkupSafe-1.1.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6"}, + {file = "MarkupSafe-1.1.1-cp37-cp37m-win32.whl", hash = "sha256:b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2"}, + {file = "MarkupSafe-1.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c"}, + {file = "MarkupSafe-1.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6788b695d50a51edb699cb55e35487e430fa21f1ed838122d722e0ff0ac5ba15"}, + {file = "MarkupSafe-1.1.1-cp38-cp38-manylinux1_i686.whl", hash = 
"sha256:cdb132fc825c38e1aeec2c8aa9338310d29d337bebbd7baa06889d09a60a1fa2"}, + {file = "MarkupSafe-1.1.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:13d3144e1e340870b25e7b10b98d779608c02016d5184cfb9927a9f10c689f42"}, + {file = "MarkupSafe-1.1.1-cp38-cp38-win32.whl", hash = "sha256:596510de112c685489095da617b5bcbbac7dd6384aeebeda4df6025d0256a81b"}, + {file = "MarkupSafe-1.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:e8313f01ba26fbbe36c7be1966a7b7424942f670f38e666995b88d012765b9be"}, + {file = "MarkupSafe-1.1.1.tar.gz", hash = "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b"}, +] more-itertools = [ {file = "more-itertools-8.3.0.tar.gz", hash = "sha256:558bb897a2232f5e4f8e2399089e35aecb746e1f9191b6584a151647e89267be"}, {file = "more_itertools-8.3.0-py3-none-any.whl", hash = "sha256:7818f596b1e87be009031c7653d01acc46ed422e6656b394b0f765ce66ed4982"}, diff --git a/pyproject.toml b/pyproject.toml @@ -10,19 +10,20 @@ python = "^3.7" gusmobile = { git = "https://git.sr.ht/~natpen/gusmobile", branch = "np/gus-hack" } whoosh = "^2.7.4" jetforce = "^0.2.0" +jinja2 = "^2.11.2" [tool.poetry.dev-dependencies] black = "^19.10b0" ipython = "^7.11.1" pytest = "^5.2" -[build-system] -requires = ["poetry>=0.12"] -build-backend = "poetry.masonry.api" - [tool.poetry.scripts] crawl = "gus.crawl:main" search_index = "gus.search_index:main" remove_domain = "gus.remove_domain:main" -serve = "gus.serve:main" +serve = "serve.main:main" statistics = "gus.lib.index_statistics:run_index_statistics" + +[build-system] +requires = ["poetry>=0.12"] +build-backend = "poetry.masonry.api" diff --git a/serve/__init__.py b/serve/__init__.py @@ -0,0 +1,3 @@ +from .views import app, gus + +__all__ = ["app", "gus"] diff --git a/serve/constants.py b/serve/constants.py @@ -0,0 +1,3 @@ +INDEX_DIR = "index" +SEED_REQUEST_FILE = "seed-requests.txt" +STATISTICS_FILE = "statistics.csv" diff --git a/serve/main.py b/serve/main.py @@ -0,0 +1,26 @@ +import asyncio + +import jetforce + 
# Search request paths look like "/search", "/search/3", "/v/search" or
# "/v/search/3/": group 1 is the optional verbose prefix and group 2 the
# optional "/<page number>" component. Written as a raw string so that "\d" is
# a regex escape and not an invalid string escape sequence.
_SEARCH_PATH_PATTERN = re.compile(r"^(/v)?/search(/\d+)?/?")


class GUS:
    """Search-side wrapper around the Whoosh index.

    Creating an instance opens the index directory named by
    ``constants.INDEX_DIR``, keeps one searcher open on it, builds the query
    parser and highlighter, and loads the last persisted statistics from
    ``constants.STATISTICS_FILE``.
    """

    # Content types for which a highlighted excerpt of "content" is produced.
    _HIGHLIGHTABLE_CONTENT_TYPES = ("text/plain", "text/gemini", "text/markdown")

    def __init__(self):
        self.ix = open_dir(constants.INDEX_DIR)
        self.searcher = self.ix.searcher()
        self.query_parser = GUS.init_query_parser(self.ix)
        self.gemini_highlighter = highlight.Highlighter(
            formatter=GeminiFormatter(),
            fragmenter=highlight.ContextFragmenter(maxchars=160, surround=80),
            scorer=GeminiScorer(),
            order=highlight.SCORE,
        )
        self.statistics = load_last_statistics_from_file(constants.STATISTICS_FILE)

    @staticmethod
    def init_query_parser(ix):
        """Build the query parser for the index *ix*.

        Queries are parsed against the "content", "url" and "prompt" fields
        with an OR group. The regex and greater/less-than plugins are added;
        the wildcard, boost and range plugins are removed.

        Declared static so it works both as ``GUS.init_query_parser(ix)`` and
        when called on an instance.
        """
        or_group = qparser.OrGroup.factory(1.0)
        query_parser = qparser.MultifieldParser(
            ["content", "url", "prompt"], ix.schema, group=or_group
        )
        query_parser.add_plugin(qparser.RegexPlugin())
        query_parser.add_plugin(qparser.GtLtPlugin())
        query_parser.remove_plugin_class(qparser.WildcardPlugin)
        query_parser.remove_plugin_class(qparser.BoostPlugin)
        query_parser.remove_plugin_class(qparser.RangePlugin)
        return query_parser

    @staticmethod
    def load_and_compute_statistics(filename):
        """Load persisted statistics from *filename* and refresh the index figures.

        Returns the loaded statistics mapping with the index-derived entries
        overwritten by freshly computed values.
        """
        statistics = load_last_statistics_from_file(filename)

        # We want fresh data for the figures below, and they aren't persisted
        # to file during non-destructive crawls, so recompute them. The index
        # location comes from the same constant __init__ opens, not a literal.
        index_statistics = compute_index_statistics(constants.INDEX_DIR)
        for key in (
            "index_modification_time",
            "page_count",
            "domain_count",
            "content_type_frequencies",
            "charset_frequencies",
            "domains",
        ):
            statistics[key] = index_statistics[key]
        return statistics

    def search_index(self, query, requested_page):
        """Run *query* against the index and return one page of results.

        Args:
            query: the raw query string; it is parsed with ``self.query_parser``.
            requested_page: 1-based page number handed to ``search_page``.

        Returns:
            A ``(count, hits)`` tuple. ``count`` is ``len()`` of the results
            page object and ``hits`` is a list of plain dicts, one per hit on
            the page (ten hits per page are requested).
        """
        parsed_query = self.query_parser.parse(query)
        results = self.searcher.search_page(parsed_query, requested_page, pagelen=10)
        return (
            len(results),
            [self._serialize_hit(hit) for hit in results],
        )

    def _serialize_hit(self, hit):
        """Convert one search hit into the dict consumed by the templates."""
        return {
            "score": hit.score,
            "indexed_at": hit["indexed_at"],
            "url": hit["url"],
            "fetchable_url": hit["fetchable_url"],
            "content_type": hit["content_type"],
            # charset, size and prompt are optional on a hit; fall back to
            # neutral values when they are absent.
            "charset": hit["charset"] if "charset" in hit else "none",
            "size": hit["size"] if "size" in hit else 0,
            "prompt": hit["prompt"] if "prompt" in hit else "",
            "highlights": self._get_highlights(hit),
            "link_text": GUS._get_link_text(hit),
        }

    def _get_highlights(self, hit):
        """Return the top highlighted excerpt for textual hits, else ""."""
        if "content" in hit and hit["content_type"] in GUS._HIGHLIGHTABLE_CONTENT_TYPES:
            return self.gemini_highlighter.highlight_hit(hit, "content", top=1)
        return ""

    @staticmethod
    def _get_link_text(result):
        """Build the human-readable link label for a search hit.

        Input-type hits are labelled ``"<url> (input: <prompt>)"``, or
        ``"<url> (input)"`` when the hit has no prompt. Every other hit is
        labelled ``"<url> (<content type>, <size>)"``. A missing size is
        treated as 0, matching the default ``search_index`` reports.
        """
        # Drop the first nine characters of the URL, i.e. the length of
        # "gemini://" (assumes indexed URLs start with that scheme).
        display_url = result["url"][len("gemini://"):]
        content_type = result["content_type"]
        if content_type == "input":
            prompt = result["prompt"] if "prompt" in result else ""
            prompt_suffix = ": {}".format(prompt) if prompt else ""
            return "{} ({}{})".format(display_url, content_type, prompt_suffix)

        size = result["size"] if "size" in result else 0
        return "{} ({}, {})".format(
            display_url,
            content_type,
            bytes2human(size, format="%(value).0f%(symbol)s"),
        )

    def get_search_suggestions(self, query):
        """Return spelling suggestions for each word of *query*.

        Each word is looked up with the searcher's corrector for the
        "content" field, asking for at most three suggestions per word.

        Returns:
            A list of ``{"raw": suggestion, "quoted": url-quoted suggestion}``
            dicts in query-word order.
        """
        suggestions = []
        corrector = self.searcher.corrector("content")
        # split() without an argument skips the empty tokens that repeated
        # spaces would otherwise feed to the corrector.
        for query_part in query.split():
            suggestions.extend(
                {"raw": suggestion, "quoted": quote(suggestion)}
                for suggestion in corrector.suggest(query_part, limit=3)
            )
        return suggestions


def compute_requested_results_page(request_path):
    """Extract the 1-based results page number from a search request path.

    Returns 1 when the path carries no page component, when the page is below
    1, or when the path is not a search path at all.
    """
    match = _SEARCH_PATH_PATTERN.match(request_path)
    if match is None or match.group(2) is None:
        return 1
    # group(2) is "/<digits>"; drop the leading slash before converting.
    return max(int(match.group(2)[1:]), 1)


def compute_verbose(request_path):
    """Return True when the search request path starts with the "/v" prefix."""
    match = _SEARCH_PATH_PATTERN.match(request_path)
    return match is not None and match.group(1) is not None


def process_seed_request(seed_request):
    """Append *seed_request* as one line to ``constants.SEED_REQUEST_FILE``."""
    with open(constants.SEED_REQUEST_FILE, "a") as seed_file:
        seed_file.write("{}\n".format(seed_request))
The currently implemented filters are: +* content_type +* domain +* charset +* size + +To filter by one of these, simply add it to your query followed by a colon, and the value you wish to filter by. Some examples of doing so follow. + +=> /search?content_type%3Aapplication/pdf application/pdf +=> /search?content_type%3Aaudio audio +=> /search?content_type%3Aimage/jpeg image/jpeg +=> /search?content_type%3Ainput input + +=> /search?domain%3Acircumlunar domain:circumlunar +=> /search?contextual%20domain%3Agus contextual domain:gus + +=> /search?computers%20content_type%3Agemini%20AND%20NOT%20charset%3AUS-ASCII computers content_type:gemini AND NOT charset:US-ASCII +=> /search?NOT%20charset%3Anone NOT charset:none + +Note that size works slightly different than the other filters, as it is numeric. Typically, you will want to limit your search results to those less than, or greater than, a certain size. + +=> /search?computer%20AND%20size%3A%3E2000 computer AND size:>2000 + +For further inspiration on how to use these filters, you can visit both GUS' list of known hosts, as well as GUS' list of known content_types and charsets on the statistics page. Note that there is some nuance to the charset values, due to the fact that specifying them is optional, and if one does not specify, there is a default of utf-8 - pages that do not specify a charset have an indexed charset value of "none". + +=> /known-hosts GUS Known Hosts (with list of domains) +=> /statistics GUS statistics (with list of content_types) + +## Verbose Mode + +To allow greater insight into both how pages are ranking against each other, as well as when GUS crawled their content, you can enable verbose mode on any search results page. This will show the numerical score of each search result for the given query, as well as the exact time that page was crawled. + +To enable verbose mode for a given search result page, you should add a new "v" path component to the URL preceding the "search" component. 
Below is an example: + +=> gemini://gus.guru/search?test +=> gemini://gus.guru/v/search?test + +Note that verbose mode is not sticky, so navigating away from a verbose search result page will return you to non-verbose mode. + +{% include 'fragments/footer.gmi' %} diff --git a/serve/templates/add_seed.gmi b/serve/templates/add_seed.gmi @@ -0,0 +1,4 @@ +{% include 'fragments/header.gmi' %} + + +Thank you for the addition! GUS usually completes indexing new sites within 24 hours, at which point you should see yours ({{ seed_url }}) in GUS search results! diff --git a/serve/templates/fragments/footer.gmi b/serve/templates/fragments/footer.gmi @@ -0,0 +1,3 @@ +=> /add-seed See any missing results? Let GUS know your gemini URL exists. + +Index updated on: {{ index_modification_time|datetimeformat }} diff --git a/serve/templates/fragments/header.gmi b/serve/templates/fragments/header.gmi @@ -0,0 +1,4 @@ +# GUS - Gemini Universal Search + +=> / Home +=> /search Search GUS diff --git a/serve/templates/fragments/pager.gmi b/serve/templates/fragments/pager.gmi @@ -0,0 +1,10 @@ +Page {{current_page}} of {{num_pages}} ({{num_results}} results) +{% if current_page > 2 %} +=> /search?{{ quoted_query }} First Page +{% endif %} +{% if current_page > 1 %} +=> /search/{{ current_page - 1 }}?{{ quoted_query }} Previous Page +{% endif %} +{% if current_page < num_pages %} +=> /search/{{ current_page + 1}}?{{ quoted_query }} Next Page +{% endif %} diff --git a/serve/templates/index.gmi b/serve/templates/index.gmi @@ -0,0 +1,9 @@ +{% include 'fragments/header.gmi' %} + +=> /about About GUS +=> /statistics GUS Statistics +=> /known-hosts Known Gemini Hosts +=> /news GUS News +=> gemini://gemini.circumlunar.space Gemini Project information + +{% include 'fragments/footer.gmi' %} diff --git a/serve/templates/known_hosts.gmi b/serve/templates/known_hosts.gmi @@ -0,0 +1,12 @@ +{% include 'fragments/header.gmi' %} + + +# Known Gemini Hosts + +Below are the hosts, or servers, in Geminispace, 
of which GUS is aware. Note that this list is auto-generated from the index, so if your host is not showing up here, it also won't have its content represented in GUS search results! If your server is missing, try the link at the bottom of this page to request an immediate crawl of a Gemini URL, after which your server should start showing up. + +{% for host in known_hosts %} +{{ "=> gemini://{} {}".format(host, host) }} +{% endfor %} + +{% include 'fragments/footer.gmi' %} diff --git a/serve/templates/news.gmi b/serve/templates/news.gmi @@ -0,0 +1,40 @@ +{% include 'fragments/header.gmi' %} + + +## 2020-06-12 +Added size of pages to each result, so users can see how much network bandwidth would be involved in following each link. Size is also provided as a valid query search filter now. Documentation for filtering by size can be found on the advanced searching section of the about page! + +## 2020-06-03 +Added ability to search and filter by charset. Documentation for this feature can be found on the advanced searching section of the about page! + +## 2020-05-21 +Added ability to search and filter by domain. Documentation for this feature can be found on the advanced searching section of the about page! + +## 2020-05-21 +Added contextual highlights from result pages' content directly to GUS search results pages. This should give a nice preview of what to expect if you click through to the result! + +## 2020-05-19 +Added instantaneous indexing of seed requests! Submit a seed request and your site should begin showing up in closer to a few minutes than a few days, as it tended to before. + +## 2020-05-16 +Added verbose search result mode. This is meant as an aid to content creators in figuring out exactly when GUS crawled specific pages, and how those pages' content scores against other results in Geminispace. Documentation for this feature is available under the advanced searching section of the about page. 
+ +## 2020-05-14 +Added paging functionality, which will now show up at the bottom of search result pages that have more than one page of results. + +## 2020-05-12 +Added Known Hosts and News, both of which you can find on the GUS homepage. + +## 2020-05-11 +Added Index Statistics feature, which you can find on the GUS homepage. + +## 2020-05-09 +Added ability to search and filter by content type. Documentation for this feature can be found on the about page! I'm really excited about this update, because it will allow users to do things like search for all the music in Geminispace, or search for all the downloadable epub content. Unsurprisingly, most of the current content is either `text/gemini` or `text/plain`, but there is still a smattering of other interesting content types already in existence in Geminispace. + +## 2020-03-04 +Added search suggestions. When a search returns with no results, GUS will now attempt to find some lexicographically similar search suggestions that would yield results. Hopefully this will help ameliorate both typos in queries, as well as the slight paucity of content in these early days of Gemini. + +## 2020-02-21 +Updated GUS indexing to respect robots.txt. Documentation for this feature can be found on the about page. + +{% include 'fragments/footer.gmi' %} diff --git a/serve/templates/search.gmi b/serve/templates/search.gmi @@ -0,0 +1,28 @@ +{% include 'fragments/header.gmi' %} + + +# Search GUS + +"{{ query }}" + +{% for result in results %} +=> {{ result["fetchable_url"] }} {{ result["link_text"] }} +{% if verbose %} +* Score : {{ "{:.2f}".format(result["score"]) }} +* Indexed at : {{ "{:%Y-%m-%d %H:%M}".format(result["indexed_at"]) }} +* Charset : {{ result["charset"] }} +{% endif %} +{% if result["highlights"] | length > 0 %} +{{ result["highlights"] }} +{% endif %} + +{% else %} +No results! 
+{% for suggestion in search_suggestions %} +=> /search?{{ suggestion }} GUS: {{ suggestion }} +{% endfor%} + +{% endfor %} +{% include 'fragments/pager.gmi' %} + +{% include 'fragments/footer.gmi' %} diff --git a/serve/templates/search_suggestions.gmi b/serve/templates/search_suggestions.gmi @@ -0,0 +1,12 @@ +{% include 'fragments/header.gmi' %} + + +# Search GUS + +No results for "{{ query }}" + +{% for suggestion in search_suggestions %} +=> /search?{{ suggestion["quoted"] }} Suggested query: "{{ suggestion["raw"] }}" +{% endfor%} + +{% include 'fragments/footer.gmi' %} diff --git a/serve/templates/statistics.gmi b/serve/templates/statistics.gmi @@ -0,0 +1,35 @@ +{% include 'fragments/header.gmi' %} + + +# GUS Statistics + +## Overall + +These figures are representative of the aggregate size of Geminispace at the time the current index was last updated on {{ index_modification_time }}. + +``` +Page Count : {{ "{:>5}".format(statistics.page_count) }} +Domain Count : {{ "{:>5}".format(statistics.domain_count) }} +``` + +## By Content Type + +These figures are representative of the number of pages seen per content type at the time the current index was last updated on {{ index_modification_time }}. + +``` +{% for pair in statistics.content_type_frequencies %} +{{ "{:>5} - {}".format(pair[1], pair[0]) }} +{% endfor %} +``` + +## By Charset + +These figures are representative of the number of pages seen per charset at the time the current index was last updated on {{ index_modification_time }}. 
TEMPLATE_DIR = os.path.join(os.path.dirname(__file__), "templates")

# Number of hits per results page. GUS.search_index requests pages of this
# same length from the index, so the page count below is derived from it.
RESULTS_PER_PAGE = 10

template_env = jinja2.Environment(
    loader=jinja2.FileSystemLoader(TEMPLATE_DIR),
    # Fail loudly when a template references a variable the view did not pass.
    undefined=jinja2.StrictUndefined,
    trim_blocks=True,
    lstrip_blocks=True,
)


def datetimeformat(value, format="%Y-%m-%d"):
    """Jinja2 filter: render *value* with ``strftime`` using *format*."""
    return value.strftime(format)


template_env.filters["datetimeformat"] = datetimeformat


def render_template(name: str, *args, **kwargs) -> str:
    """Render the template called *name* from TEMPLATE_DIR.

    Positional and keyword arguments are passed straight through to the
    Jinja2 template's ``render`` as its context.
    """
    return template_env.get_template(name).render(*args, **kwargs)


app = JetforceApplication()
gus = GUS()


@app.route("/add-seed", strict_trailing_slash=False)
def add_seed(request):
    """Record a submitted seed URL, or prompt for one when no query is given."""
    if not request.query:
        return Response(Status.INPUT, "Gemini URL")

    process_seed_request(request.query)
    body = render_template("add_seed.gmi", seed_url=request.query)
    return Response(Status.SUCCESS, "text/gemini", body)


@app.route("/statistics", strict_trailing_slash=False)
def statistics(request):
    """Serve the index statistics page."""
    body = render_template(
        "statistics.gmi",
        statistics=gus.statistics,
        index_modification_time=gus.statistics["index_modification_time"],
    )
    return Response(Status.SUCCESS, "text/gemini", body)


@app.route("/known-hosts", strict_trailing_slash=False)
def known_hosts(request):
    """Serve the list of hosts present in the index statistics."""
    body = render_template(
        "known_hosts.gmi",
        # TODO: remove this `sorted` after the next index generation
        known_hosts=sorted(gus.statistics["domains"]),
        index_modification_time=gus.statistics["index_modification_time"],
    )
    return Response(Status.SUCCESS, "text/gemini", body)


@app.route("", strict_trailing_slash=False)
def index(request):
    """Serve the home page."""
    body = render_template(
        "index.gmi",
        index_modification_time=gus.statistics["index_modification_time"],
    )
    return Response(Status.SUCCESS, "text/gemini", body)


# The about and news views were both also named `index`, each rebinding the
# module-level name; they now carry their own names.
@app.route("/about", strict_trailing_slash=False)
def about(request):
    """Serve the about page."""
    body = render_template(
        "about.gmi",
        index_modification_time=gus.statistics["index_modification_time"],
    )
    return Response(Status.SUCCESS, "text/gemini", body)


@app.route("/news", strict_trailing_slash=False)
def news(request):
    """Serve the news page."""
    body = render_template(
        "news.gmi",
        index_modification_time=gus.statistics["index_modification_time"],
    )
    return Response(Status.SUCCESS, "text/gemini", body)


# Raw string: "\d" must reach the router as a regex escape, not be treated as
# an (invalid) string escape sequence.
@app.route(r"(/v)?/search(/\d+)?")
def search(request):
    """Serve a page of search results, or suggestions when nothing matches.

    Without a query the client is asked for one via an INPUT response. The
    optional "/v" prefix and "/<page>" component of the path select verbose
    mode and the results page respectively.
    """
    if not request.query:
        return Response(Status.INPUT, "Search query")

    verbose = compute_verbose(request.path)
    requested_page = compute_requested_results_page(request.path)
    num_results, results = gus.search_index(request.query, requested_page)

    if num_results > 0:
        num_pages = math.ceil(num_results / RESULTS_PER_PAGE)
        # Never report a page beyond the last one.
        current_page = min(requested_page, num_pages)
        body = render_template(
            "search.gmi",
            query=request.query,
            quoted_query=quote(request.query),
            verbose=verbose,
            num_results=num_results,
            results=results,
            current_page=current_page,
            num_pages=num_pages,
            index_modification_time=gus.statistics["index_modification_time"],
        )
    else:
        search_suggestions = gus.get_search_suggestions(request.query)
        body = render_template(
            "search_suggestions.gmi",
            query=request.query,
            search_suggestions=search_suggestions,
            index_modification_time=gus.statistics["index_modification_time"],
        )
    return Response(Status.SUCCESS, "text/gemini", body)