geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit d7518388dd0fa358d7ae454d3feeb1fb7e6c2aa7
parent 2c7edac9e1fef61fb30f584cbe9700a51b5dec5f
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Fri,  3 Jul 2020 10:39:56 -0400

[crawl] [serve] Switch crawl to 2-phase with sqlite

Diffstat:
A gus/build_index.py          | 221 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M gus/crawl.py                | 266 +++++++++++++++++++++++++++++++++----------------------------------------------
M gus/lib/index_statistics.py |  15 ++++++++-------
M poetry.lock                 |  39 ++++++++++++++++++++++++++++++++++-----
M pyproject.toml              |   3 +++
M serve/models.py             |  17 +----------------
M serve/templates/about.gmi   |   6 +++---
7 files changed, 382 insertions(+), 185 deletions(-)
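
The crawl is now split into two phases: gus/crawl.py only records pages and the links between them in SQLite (through the peewee models added below), and the new gus/build_index.py entry point later turns those rows into the Whoosh index. A minimal sketch of that flow, reusing the Page and init_db definitions this commit adds — the sample URL and field values are illustrative only, not part of the patch:

from datetime import datetime

from gus.crawl import Page, init_db

# Phase 1 (crawl): persist what was fetched; nothing touches Whoosh here.
db = init_db("index/gus.sqlite")
page, _ = Page.get_or_create(url="gemini://gus.guru/")
page.content_type = "text/gemini"
page.indexed_at = datetime.utcnow()
page.save()

# Phase 2 (build_index): a separate pass selects every Page row with
# indexed_at set and feeds it to the Whoosh writer, as build_index() does.
for crawled in Page.select().where(Page.indexed_at.is_null(False)):
    print(crawled.url, crawled.content_type)

Keeping crawl results in SQLite means the index can be rebuilt, or its schema changed, without re-crawling Geminispace.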

diff --git a/gus/build_index.py b/gus/build_index.py
@@ -0,0 +1,221 @@
+import argparse
+from datetime import datetime, timedelta
+import os
+import pathlib
+import pickle
+import re
+import shutil
+import time
+from urllib.parse import urljoin, uses_relative, uses_netloc
+
+import gusmobile as gemini
+from peewee import (
+    BooleanField,
+    DateTimeField,
+    DoesNotExist,
+    FloatField,
+    ForeignKeyField,
+    IntegerField,
+    Model,
+    SqliteDatabase,
+    TextField,
+)
+from whoosh.analysis import FancyAnalyzer
+from whoosh.fields import Schema, TEXT, DATETIME, STORED, ID, NUMERIC, KEYWORD
+from whoosh.filedb.filestore import FileStorage
+from whoosh.index import create_in, open_dir
+from whoosh.query import Every
+from whoosh.qparser import QueryParser
+from whoosh.writing import BufferedWriter
+
+from gus.lib.index_statistics import compute_index_statistics, persist_statistics, print_index_statistics
+from gus.lib.whoosh_extensions import UrlAnalyzer
+from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
+
+# hack(natpen): the built-in methods in urllib need to know the
+# Gemini protocol exists
+uses_relative.append("gemini")
+uses_netloc.append("gemini")
+
+INDEX_DIR_CURRENT = "index"
+# INDEX_DIR_BACKUP = INDEX_DIR_CURRENT + ".bak"
+INDEX_DIR_NEW = INDEX_DIR_CURRENT + ".new"
+
+# def backup_old_index(index_dir, backup_dir):
+#     last_index_modification_time = datetime.fromtimestamp(os.path.getmtime(index_dir))
+#     print("Backing up last index from {:%Y-%m-%d}...".format(last_index_modification_time))
+#     print("--------------------------")
+#     backup_index_dir = backup_dir + "/{:%Y-%m-%d}".format(last_index_modification_time)
+#     shutil.rmtree(backup_index_dir, ignore_errors=True)
+#     shutil.copytree(index_dir, backup_index_dir)
+
+
+def create_index(index_dir):
+    # shutil.rmtree(index_dir, ignore_errors=True)
+    pathlib.Path(index_dir).mkdir(parents=True, exist_ok=True)
+    schema = Schema(
+        url_id=ID(
+            unique=True,
+        ),
+        url=TEXT(
+            field_boost=2.0,
+            stored=True,
+            analyzer=UrlAnalyzer(),
+        ),
+        fetchable_url=STORED(),
+        domain=TEXT(
+            analyzer=UrlAnalyzer(),
+        ),
+        port=NUMERIC(
+            int,
+            32,
+            signed=False,
+            stored=True,
+        ),
+        content_type=TEXT(
+            stored=True,
+        ),
+        charset=ID(
+            stored=True,
+        ),
+        lang=ID(
+            stored=True,
+        ),
+        content=TEXT(
+            analyzer=FancyAnalyzer(),
+            spelling=True,
+            stored=True,
+        ),
+        prompt=TEXT(
+            analyzer=FancyAnalyzer(),
+            stored=True,
+        ),
+        size=NUMERIC(
+            int,
+            # this means GUS will have problems indexing responses over ~2GB
+            32,
+            signed=False,
+            stored=True,
+        ),
+        backlinks=KEYWORD(
+            stored=True,
+        ),
+        indexed_at=DATETIME(
+            stored=True,
+        ),
+    )
+    index_storage.create_index(schema)
+
+
+def index_page(page):
+    backlinks = (Page
+                 .select()
+                 .join(Link, on=Link.from_page)
+                 .where(Link.to_page == page))
+    document = {
+        "url_id": page.url,
+        "url": page.url,
+        "fetchable_url": page.fetchable_url,
+        "domain": page.domain,
+        "port": page.port,
+        "content_type": page.content_type,
+        "charset": page.charset or "none",
+        "lang": page.lang,
+        "size": page.size,
+        "indexed_at": page.indexed_at,
+        "backlinks": " ".join([b.url for b in backlinks]),
+        "prompt": page.prompt,
+        "content": page.content,
+    }
+    index_writer.add_document(**document)
+
+
+class Page(Model):
+    """
+    All the pages crawled in Geminispace
+    """
+
+    url = TextField(unique=True, index=True)
+    fetchable_url = TextField(null=True)
+    domain = TextField(null=True)
+    port = IntegerField(null=True)
+    content_type = TextField(null=True)
+    charset = TextField(null=True)
+    # TODO: normalize lang out to handle multiple values better
+    lang = TextField(null=True)
+    content = TextField(null=True)
+    prompt = TextField(null=True)
+    size = IntegerField(null=True)  # in bytes
+    indexed_at = DateTimeField(null=True)
+
+
+class Link(Model):
+    """
+    Hyperlinks between pages in Geminispace
+    """
+
+    from_page = ForeignKeyField(Page, backref="outbound_links")
+    to_page = ForeignKeyField(Page, backref="backlinks")
+
+
+def init_db(filename=":memory:"):
+    """
+    Bind an SQLite database to the Peewee ORM models.
+    """
+    models = [Page, Link]
+    db = SqliteDatabase(filename)
+    db.bind(models)
+    db.create_tables(models)
+    return db
+
+
+def build_index(should_run_destructive=False):
+    global index_dir
+    index_dir = INDEX_DIR_NEW if should_run_destructive else INDEX_DIR_CURRENT
+    global index_storage
+    index_storage = FileStorage(index_dir)
+    if should_run_destructive:
+        # backup_old_index(INDEX_DIR_CURRENT, INDEX_DIR_BACKUP)
+        create_index(index_dir)
+    global db
+    db = init_db(index_dir + "/gus.sqlite")
+    global ix
+    ix = index_storage.open_index()
+    global index_writer
+    index_writer = ix.writer()
+
+    pages = Page.select().where(Page.indexed_at.is_null(False))
+    for page in pages.iterator():
+        index_page(page)
+    index_writer.commit()
+
+    index_statistics = compute_index_statistics(index_dir)
+    print_index_statistics(index_statistics)
+    persist_statistics(index_statistics, None, should_run_destructive, "statistics.csv")
+    # if should_run_destructive:
+    #     # replace current index with new index
+    #     shutil.rmtree(INDEX_DIR_CURRENT, ignore_errors=True)
+    #     shutil.move(INDEX_DIR_NEW, INDEX_DIR_CURRENT)
+
+
+def main():
+    args = parse_args()
+    build_index(args.should_run_destructive)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Crawl Geminispace.')
+    parser.add_argument(
+        "--destructive",
+        "-d",
+        dest="should_run_destructive",
+        action="store_true",
+        default=False,
+        help="create a fresh index and perform a full Geminispace crawl",
+    )
+    args = parser.parse_args()
+    return args
+
+
+if __name__ == "__main__":
+    main()
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -9,16 +9,18 @@ import time
 from urllib.parse import urljoin, uses_relative, uses_netloc
 
 import gusmobile as gemini
-from whoosh.analysis import FancyAnalyzer
-from whoosh.fields import Schema, TEXT, DATETIME, STORED, ID, NUMERIC, KEYWORD
-from whoosh.filedb.filestore import FileStorage
-from whoosh.index import create_in, open_dir
-from whoosh.query import Every
-from whoosh.qparser import QueryParser
-from whoosh.writing import BufferedWriter
-
-from gus.lib.index_statistics import compute_index_statistics, persist_statistics, print_index_statistics
-from gus.lib.whoosh_extensions import UrlAnalyzer
+from peewee import (
+    BooleanField,
+    DateTimeField,
+    DoesNotExist,
+    FloatField,
+    ForeignKeyField,
+    IntegerField,
+    Model,
+    SqliteDatabase,
+    TextField,
+)
+
 from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
 
 # hack(natpen): the built-in methods in urllib need to know the
@@ -176,139 +178,84 @@ CRAWL_DELAYS = {
 }
 
 
-def backup_old_index(index_dir, backup_dir):
-    last_index_modification_time = datetime.fromtimestamp(os.path.getmtime(index_dir))
-    print("Backing up last index from {:%Y-%m-%d}...".format(last_index_modification_time))
-    print("--------------------------")
-    backup_index_dir = backup_dir + "/{:%Y-%m-%d}".format(last_index_modification_time)
-    shutil.rmtree(backup_index_dir, ignore_errors=True)
-    shutil.copytree(index_dir, backup_index_dir)
-
-
-def create_index(index_dir):
-    shutil.rmtree(index_dir, ignore_errors=True)
-    pathlib.Path(index_dir).mkdir(parents=True, exist_ok=True)
-    schema = Schema(
-        url_id=ID(
-            unique=True,
-        ),
-        url=TEXT(
-            field_boost=2.0,
-            stored=True,
-            analyzer=UrlAnalyzer(),
-        ),
-        fetchable_url=STORED(),
-        domain=TEXT(
-            analyzer=UrlAnalyzer(),
-        ),
-        content_type=TEXT(
-            stored=True,
-        ),
-        charset=ID(
-            stored=True,
-        ),
-        lang=ID(
-            stored=True,
-        ),
-        content=TEXT(
-            analyzer=FancyAnalyzer(),
-            spelling=True,
-            stored=True,
-        ),
-        prompt=TEXT(
-            analyzer=FancyAnalyzer(),
-            stored=True,
-        ),
-        size=NUMERIC(
-            int,
-            # this means GUS will have problems indexing responses over ~2GB
-            32,
-            signed=False,
-            stored=True,
-        ),
-        backlinks=KEYWORD(
-            stored=True,
-        ),
-        indexed_at=DATETIME(
-            stored=True,
-        ),
-    )
-    index_storage.create_index(schema)
-
-
 def index_binary(resource, response):
     print("INDEXING BINARY...")
-    with index_writer.searcher() as searcher:
-        result = searcher.document(url_id=resource.indexable_url)
-        update_document(
-            result,
-            url_id=resource.indexable_url,
-            url=resource.indexable_url,
-            fetchable_url=resource.fetchable_url,
-            domain=resource.normalized_host,
-            content_type=response.content_type,
-            charset=response.charset or "none",
-            size=response.num_bytes,
-            indexed_at=datetime.utcnow(),
-        )
+    doc = {
+        "url": resource.indexable_url,
+        "fetchable_url": resource.fetchable_url,
+        "domain": resource.normalized_host,
+        "port": resource.urlsplit.port or 1965,
+        "content_type": response.content_type,
+        "charset": response.charset,
+        "size": response.num_bytes,
+        "indexed_at": datetime.utcnow(),
+    }
+    existing_page = Page.get_or_none(url=resource.indexable_url)
+    if existing_page:
+        doc["id"] = existing_page.id
+    page = Page(**doc)
+    page.save()
 
 
 def index_prompt(resource, response):
     print("INDEXING PROMPT...")
-    with index_writer.searcher() as searcher:
-        result = searcher.document(url_id=resource.indexable_url)
-        update_document(
-            result,
-            url_id=resource.indexable_url,
-            url=resource.indexable_url,
-            fetchable_url=resource.fetchable_url,
-            domain=resource.normalized_host,
-            content_type="input",
-            charset=response.charset or "none",
-            size=response.num_bytes,
-            prompt=response.prompt,
-            indexed_at=datetime.utcnow(),
-        )
+    doc = {
+        "url": resource.indexable_url,
+        "fetchable_url": resource.fetchable_url,
+        "domain": resource.normalized_host,
+        "port": resource.urlsplit.port or 1965,
+        "content_type": "input",
+        "charset": response.charset,
+        "size": response.num_bytes,
+        "prompt": response.prompt,
+        "indexed_at": datetime.utcnow(),
+    }
+    existing_page = Page.get_or_none(url=resource.indexable_url)
+    if existing_page:
+        doc["id"] = existing_page.id
+    page = Page(**doc)
+    page.save()
 
 
 def index_content(resource, response):
     print("INDEXING CONTENT...")
     doc = {
-        "url_id": resource.indexable_url,
         "url": resource.indexable_url,
         "fetchable_url": resource.fetchable_url,
         "domain": resource.normalized_host,
+        "port": resource.urlsplit.port or 1965,
         "content_type": response.content_type,
-        "charset": response.charset or "none",
+        "charset": response.charset,
         "content": response.content,
         "size": response.num_bytes,
        "indexed_at": datetime.utcnow(),
    }
    if response.content_type == "text/gemini":
        doc["lang"] = response.lang or "none",
-    with index_writer.searcher() as searcher:
-        result = searcher.document(url_id=resource.indexable_url)
-        update_document(result, **doc)
+    existing_page = Page.get_or_none(url=resource.indexable_url)
+    if existing_page:
+        doc["id"] = existing_page.id
+    page = Page(**doc)
+    page.save()
 
 
-def index_backlinks(resource, contained_resources):
+def index_links(from_resource, contained_resources):
+    from_page, created = Page.get_or_create(url=from_resource.indexable_url)
+    data = []
     for cr in contained_resources:
-        with ix.searcher() as searcher:
-            result = searcher.document(url_id=resource.indexable_url)
-            backlinks = set()
-            if result and "backlinks" in result:
-                backlinks = set(result["backlinks"].split())
-            backlinks.add(resource.fetchable_url)
-            update_document(result, url_id=cr.indexable_url, backlinks=" ".join(backlinks))
-
-
-def update_document(document, **kwargs):
-    if not document:
-        document = {}
-    # pdb.set_trace()
-    for key, value in kwargs.items():
-        document[key] = value
-    index_writer.update_document(**document)
+        to_page = Page.get_or_none(url=cr.indexable_url)
+        if not to_page:
+            to_page = Page.create(
+                url=cr.indexable_url,
+                fetchable_url=cr.fetchable_url,
+                domain=cr.normalized_host,
+                port=cr.urlsplit.port or 1965,
+            )
+        data.append({
+            "from_page": from_page,
+            "to_page": to_page,
+        })
+    Link.insert_many(data).execute()
 
 
 def get_robots_file(robot_host):
@@ -386,7 +333,6 @@ def crawl(gemini_resource):
         # problem before getting a response
         print("ERROR : %s" % gr.fetchable_url)
         print("--------------------------")
-        crawl_statistics["broken_url_count"] += 1
     elif response.status.startswith("3"):
         # redirect status
         print("REDIRECT : %s -> %s" % (gr.fetchable_url, response.url))
@@ -395,9 +341,6 @@ def crawl(gemini_resource):
         # of a trailing slash), then the crawl of the redirect would think it had
         # already seen this resource in visited_urls' normalized source of truth.
         visited_urls.pop()
-        crawl_statistics["redirect_count"] += 1
-        # if is_nontrivial_redirect(gr.fetchable_url, r.url):
-        #     crawl_statistics["redirect_nontrivial_count"] += 1
         redirect_resource = GeminiResource(response.url, gr.normalized_url, gr.normalized_host)
         crawl(redirect_resource)
     elif response.status.startswith("1"):
@@ -418,7 +361,7 @@ def crawl(gemini_resource):
             print("Extracting contained resources...")
             print("--------------------------")
             contained_resources = gr.extract_contained_resources(response.content)
-            index_backlinks(gr, contained_resources)
+            index_links(gr, contained_resources)
             for resource in contained_resources:
                 crawl(resource)
         else:
@@ -437,11 +380,7 @@ def is_nontrivial_redirect(url, redirect_url):
 
 
 def load_visited_urls(index_dir):
-    visited_urls = []
-    ix = open_dir(index_dir)
-    with ix.reader() as reader:
-        all_stored_fields = reader.all_stored_fields()
-        visited_urls = [GeminiResource(f["url"]).normalized_url for f in all_stored_fields]
+    visited_urls = [GeminiResource(p.url).normalized_url for p in Page.select()]
     return visited_urls
 
 
@@ -464,20 +403,53 @@ def load_seed_request_urls():
     return content
 
 
+class Page(Model):
+    """
+    All the pages crawled in Geminispace
+    """
+
+    url = TextField(unique=True, index=True)
+    fetchable_url = TextField(null=True)
+    domain = TextField(null=True)
+    port = IntegerField(null=True)
+    content_type = TextField(null=True)
+    charset = TextField(null=True)
+    # TODO: normalize lang out to handle multiple values better
+    lang = TextField(null=True)
+    content = TextField(null=True)
+    prompt = TextField(null=True)
+    size = IntegerField(null=True)  # in bytes
+    indexed_at = DateTimeField(null=True)
+
+
+class Link(Model):
+    """
+    Hyperlinks between pages in Geminispace
+    """
+
+    from_page = ForeignKeyField(Page, backref="outbound_links")
+    to_page = ForeignKeyField(Page, backref="backlinks")
+
+
+def init_db(filename=":memory:"):
+    """
+    Bind an SQLite database to the Peewee ORM models.
+    """
+    models = [Page, Link]
+    db = SqliteDatabase(filename)
+    db.bind(models)
+    db.create_tables(models)
+    return db
+
+
 def run_crawl(should_run_destructive=False, seed_urls=[]):
     # TODO: track failed domain/page attempts, and don't reattempt for 15mins
     global index_dir
     index_dir = INDEX_DIR_NEW if should_run_destructive else INDEX_DIR_CURRENT
-    global index_storage
-    index_storage = FileStorage(index_dir)
-    if should_run_destructive:
-        backup_old_index(INDEX_DIR_CURRENT, INDEX_DIR_BACKUP)
-        create_index(index_dir)
-    global ix
-    ix = index_storage.open_index()
-    global index_writer
-    index_writer = BufferedWriter(ix, period=120, limit=1)
+    pathlib.Path(index_dir).mkdir(parents=True, exist_ok=True)
+    global db
+    db = init_db(index_dir + "/gus.sqlite")
     global visited_urls
     visited_urls = [] if should_run_destructive else load_visited_urls(INDEX_DIR_CURRENT)
@@ -485,14 +457,7 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     robot_file_map = {} if should_run_destructive else unpickle_robot_file_map(INDEX_DIR_CURRENT)
     global domain_hit_timings
     domain_hit_timings = {}
-    global crawl_statistics
-    crawl_statistics = {
-        # any redirect counts
-        "redirect_count": 0,
-        # more than just adding/removing trailing slash
-        "redirect_nontrivial_count": 0,
-        "broken_url_count": 0,
-    }
+
     seed_urls.extend(SEED_URLS)
     seed_resources = [GeminiResource(url) for url in seed_urls]
     for resource in seed_resources:
@@ -504,17 +469,10 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     seed_request_resources = [GeminiResource(url) for url in seed_request_urls]
     for resource in seed_request_resources:
         crawl(resource)
-    index_writer.close()
     pickle_robot_file_map(robot_file_map, index_dir)
-    index_statistics = compute_index_statistics(index_dir)
-    print_index_statistics(index_statistics, crawl_statistics)
-    persist_statistics(index_statistics, crawl_statistics, should_run_destructive, "statistics.csv")
-    if should_run_destructive:
-        # replace current index with new index
-        shutil.rmtree(INDEX_DIR_CURRENT, ignore_errors=True)
-        shutil.move(INDEX_DIR_NEW, INDEX_DIR_CURRENT)
+
+    print("Finished!")
 
 
 def main():
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -66,13 +66,14 @@ def compute_index_statistics(index_dir):
     }
 
 
-def print_index_statistics(index_statistics, crawl_statistics):
+def print_index_statistics(index_statistics, crawl_statistics=None):
     print("Index generation date : {:%Y-%m-%d}".format(index_statistics["index_modification_time"]))
     print("Page Count : {:>6}".format(index_statistics["page_count"]))
     print("Domain Count : {:>6}".format(index_statistics["domain_count"]))
-    print("Redirect count : {:>6}".format(crawl_statistics["redirect_count"]))
-    print("Nontrivial redirect count : {:>6}".format(crawl_statistics["redirect_nontrivial_count"]))
-    print("Broken URL count : {:>6}".format(crawl_statistics["broken_url_count"]))
+    if crawl_statistics:
+        print("Redirect count : {:>6}".format(crawl_statistics["redirect_count"]))
+        print("Nontrivial redirect count : {:>6}".format(crawl_statistics["redirect_nontrivial_count"]))
+        print("Broken URL count : {:>6}".format(crawl_statistics["broken_url_count"]))
     print("Domains : {}".format(index_statistics["domains"]))
 
     # for domain in index_statistics["domains"]:
@@ -104,9 +105,9 @@ def serialize_statistics_line(index_statistics, crawl_statistics, was_destructiv
         was_destructive,
         index_statistics["page_count"],
         index_statistics["domain_count"],
-        crawl_statistics["redirect_count"],
-        crawl_statistics["redirect_nontrivial_count"],
-        crawl_statistics["broken_url_count"],
+        crawl_statistics["redirect_count"] if crawl_statistics else 0,
+        crawl_statistics["redirect_nontrivial_count"] if crawl_statistics else 0,
+        crawl_statistics["broken_url_count"] if crawl_statistics else 0,
         "|".join(index_statistics["domains"]),
         "|".join("{}:{}".format(pair[0], pair[1]) for pair in index_statistics["content_type_frequencies"]),
        "|".join("{}:{}".format(pair[0], pair[1]) for pair in index_statistics["charset_frequencies"]),
diff --git a/poetry.lock b/poetry.lock
@@ -9,7 +9,7 @@ version = "1.4.4"
 [[package]]
 category = "dev"
 description = "Disable App Nap on OS X 10.9"
-marker = "sys_platform == \"darwin\""
+marker = "python_version >= \"3.4\" and sys_platform == \"darwin\" or sys_platform == \"darwin\""
 name = "appnope"
 optional = false
 python-versions = "*"
@@ -77,7 +77,7 @@ version = "7.1.2"
 [[package]]
 category = "dev"
 description = "Cross-platform colored terminal text."
-marker = "sys_platform == \"win32\""
+marker = "python_version >= \"3.4\" and sys_platform == \"win32\" or sys_platform == \"win32\""
 name = "colorama"
 optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
@@ -122,6 +122,21 @@ testing = ["packaging", "pep517", "importlib-resources (>=1.3)"]
 
 [[package]]
 category = "dev"
+description = "IPython-enabled pdb"
+name = "ipdb"
+optional = false
+python-versions = ">=2.7"
+version = "0.13.3"
+
+[package.dependencies]
+setuptools = "*"
+
+[package.dependencies.ipython]
+python = ">=3.4"
+version = ">=5.1.0"
+
+[[package]]
+category = "dev"
 description = "IPython: Productive Interactive Computing"
 name = "ipython"
 optional = false
@@ -245,9 +260,17 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
 version = "0.8.0"
 
 [[package]]
+category = "main"
+description = "a little orm"
+name = "peewee"
+optional = false
+python-versions = "*"
+version = "3.13.3"
+
+[[package]]
 category = "dev"
 description = "Pexpect allows easy control of interactive console applications."
-marker = "sys_platform != \"win32\""
+marker = "python_version >= \"3.4\" and sys_platform != \"win32\" or sys_platform != \"win32\""
 name = "pexpect"
 optional = false
 python-versions = "*"
@@ -294,7 +317,7 @@ wcwidth = "*"
 [[package]]
 category = "dev"
 description = "Run a subprocess in a pseudo terminal"
-marker = "sys_platform != \"win32\""
+marker = "python_version >= \"3.4\" and sys_platform != \"win32\" or sys_platform != \"win32\""
 name = "ptyprocess"
 optional = false
 python-versions = "*"
@@ -428,7 +451,7 @@ docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"]
 testing = ["jaraco.itertools", "func-timeout"]
 
 [metadata]
-content-hash = "f8438379b319f686f48f3b66cabd352c8356a59eadca1630b5c05cd42dfab04d"
+content-hash = "de8df694bd7d989863ed4249d3854b696f8f7702aef0c5280ca5d799d02512b9"
 python-versions = "^3.7"
 
 [metadata.files]
@@ -473,6 +496,9 @@ importlib-metadata = [
     {file = "importlib_metadata-1.6.1-py2.py3-none-any.whl", hash = "sha256:15ec6c0fd909e893e3a08b3a7c76ecb149122fb14b7efe1199ddd4c7c57ea958"},
     {file = "importlib_metadata-1.6.1.tar.gz", hash = "sha256:0505dd08068cfec00f53a74a0ad927676d7757da81b7436a6eefe4c7cf75c545"},
 ]
+ipdb = [
+    {file = "ipdb-0.13.3.tar.gz", hash = "sha256:d6f46d261c45a65e65a2f7ec69288a1c511e16206edb2875e7ec6b2f66997e78"},
+]
 ipython = [
     {file = "ipython-7.15.0-py3-none-any.whl", hash = "sha256:1b85d65632211bf5d3e6f1406f3393c8c429a47d7b947b9a87812aa5bce6595c"},
     {file = "ipython-7.15.0.tar.gz", hash = "sha256:0ef1433879816a960cd3ae1ae1dc82c64732ca75cec8dab5a4e29783fb571d0e"},
@@ -544,6 +570,9 @@ pathspec = [
     {file = "pathspec-0.8.0-py2.py3-none-any.whl", hash = "sha256:7d91249d21749788d07a2d0f94147accd8f845507400749ea19c1ec9054a12b0"},
     {file = "pathspec-0.8.0.tar.gz", hash = "sha256:da45173eb3a6f2a5a487efba21f050af2b41948be6ab52b6a1e3ff22bb8b7061"},
 ]
+peewee = [
+    {file = "peewee-3.13.3.tar.gz", hash = "sha256:1269a9736865512bd4056298003aab190957afe07d2616cf22eaf56cb6398369"},
+]
 pexpect = [
     {file = "pexpect-4.8.0-py2.py3-none-any.whl", hash = "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937"},
     {file = "pexpect-4.8.0.tar.gz", hash = "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c"},
diff --git a/pyproject.toml b/pyproject.toml
@@ -11,14 +11,17 @@ gusmobile = { git = "https://git.sr.ht/~natpen/gusmobile", branch = "np/gus-hack
 whoosh = "^2.7.4"
 jetforce = "^0.2.0"
 jinja2 = "^2.11.2"
+peewee = "^3.13.3"
 
 [tool.poetry.dev-dependencies]
 black = "^19.10b0"
 ipython = "^7.11.1"
 pytest = "^5.2"
+ipdb = "^0.13.3"
 
 [tool.poetry.scripts]
 crawl = "gus.crawl:main"
+build_index = "gus.build_index:main"
 search_index = "gus.search_index:main"
 remove_domain = "gus.remove_domain:main"
 serve = "serve.main:main"
diff --git a/serve/models.py b/serve/models.py
@@ -21,7 +21,7 @@ class GUS():
             scorer=GeminiScorer(),
             order=highlight.SCORE,
         )
-        self.statistics = load_last_statistics_from_file(constants.STATISTICS_FILE)
+        self.statistics = compute_index_statistics(constants.INDEX_DIR)
 
     def init_query_parser(ix):
@@ -35,21 +35,6 @@ class GUS():
 
         return query_parser
 
-    def load_and_compute_statistics(filename):
-        statistics = load_last_statistics_from_file(filename)
-
-        # we want fresh data for the below figures, and they aren't persisted to file
-        # during non-destructive crawls, so recompute them!
-        index_statistics = compute_index_statistics("index")
-        statistics["index_modification_time"] = index_statistics["index_modification_time"]
-        statistics["page_count"] = index_statistics["page_count"]
-        statistics["domain_count"] = index_statistics["domain_count"]
-        statistics["content_type_frequencies"] = index_statistics["content_type_frequencies"]
-        statistics["charset_frequencies"] = index_statistics["charset_frequencies"]
-        statistics["domains"] = index_statistics["domains"]
-        return statistics
-
-
     def search_index(self, query, requested_page):
         query = self.query_parser.parse(query)
         results = self.searcher.search_page(query, requested_page, pagelen=10)
diff --git a/serve/templates/about.gmi b/serve/templates/about.gmi
@@ -54,12 +54,12 @@ There is a button at the top of each search results page to toggle verbose mode
 
 Note that verbose mode is sticky, and will persist between pages of results results, so you will need to manually toggle verbose mode off when you are finished with it.
 
-{% include 'fragments/footer.gmi' %}
-
 ### Backlinks
 
 For a given page in Geminispace, backlinks are all the other pages in Geminispace that link to that page. When viewing GUS search results in verbose mode, a link to view each result's backlinks, if there any, will be provided. The URL structure for retrieving a certain URL's backlinks page is predictable, should you want to link directly to it in other contexts. All you need to do is URL encode the entire URL you want information on, then pass that as a query to gemini://gus.guru/backlinks. An example follows:
 
-=> gemini://gus.guru/backlinks?gemini%3A//envs.net/~coleman/journal.gmi
+=> gemini://gus.guru/backlinks?gemini%3A//gus.guru/
+
+{% include 'fragments/footer.gmi' %}
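
One consequence of the diff above is that backlinks are no longer patched into Whoosh documents during the crawl; index_page() in gus/build_index.py derives them from the Link table when the index is built. A sketch of that lookup using the models from the diff — the helper name backlink_urls is hypothetical, but the join mirrors the patch:

def backlink_urls(page):
    # Pages that link *to* `page`: join Link on its from_page side and
    # keep the rows whose to_page is the page being indexed.
    backlinks = (Page
                 .select()
                 .join(Link, on=Link.from_page)
                 .where(Link.to_page == page))
    return " ".join(b.url for b in backlinks)

With the new [tool.poetry.scripts] entry, the two phases also run as separate commands: poetry run crawl to populate SQLite, then poetry run build_index to build the Whoosh index from it.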