geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit b6ffd1fc684e26be9037d2959682590bb959ee20
parent b8fb69f84254a1cfd695683dd50d159e59d9ab14
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Tue, 30 Jun 2020 07:07:36 -0400

[crawl] [serve] Add backlinks

Diffstat:
M gus/crawl.py                         | 56 ++++++++++++++++++++++++++++++++++++++++++++--------
M serve/models.py                      | 10 +++++++++-
A serve/templates/backlinks.gmi        | 16 ++++++++++++++++
M serve/templates/fragments/footer.gmi |  2 +-
M serve/templates/search.gmi           |  5 +++++
M serve/views.py                       | 16 +++++++++++++++-
6 files changed, 94 insertions(+), 11 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py @@ -10,11 +10,12 @@ from urllib.parse import urljoin, uses_relative, uses_netloc import gusmobile as gemini from whoosh.analysis import FancyAnalyzer -from whoosh.fields import Schema, TEXT, DATETIME, STORED, ID, NUMERIC +from whoosh.fields import Schema, TEXT, DATETIME, STORED, ID, NUMERIC, KEYWORD from whoosh.filedb.filestore import FileStorage from whoosh.index import create_in, open_dir from whoosh.query import Every from whoosh.qparser import QueryParser +from whoosh.writing import BufferedWriter from gus.lib.index_statistics import compute_index_statistics, persist_statistics, print_index_statistics from gus.lib.whoosh_extensions import UrlAnalyzer @@ -187,6 +188,9 @@ def create_index(index_dir): shutil.rmtree(index_dir, ignore_errors=True) pathlib.Path(index_dir).mkdir(parents=True, exist_ok=True) schema = Schema( + url_id=ID( + unique=True, + ), url=TEXT( field_boost=2.0, stored=True, @@ -221,8 +225,11 @@ def create_index(index_dir): signed=False, stored=True, ), + backlinks=KEYWORD( + stored=True, + ), indexed_at=DATETIME( - stored=True + stored=True, ), ) index_storage.create_index(schema) @@ -230,8 +237,11 @@ def create_index(index_dir): def index_binary(resource, response): print("INDEXING BINARY...") - - index_writer.add_document( + with index_writer.searcher() as searcher: + result = searcher.document(url_id=resource.indexable_url) + update_document( + result, + url_id=resource.indexable_url, url=resource.indexable_url, fetchable_url=resource.fetchable_url, domain=resource.normalized_host, @@ -244,7 +254,11 @@ def index_binary(resource, response): def index_prompt(resource, response): print("INDEXING PROMPT...") - index_writer.add_document( + with index_writer.searcher() as searcher: + result = searcher.document(url_id=resource.indexable_url) + update_document( + result, + url_id=resource.indexable_url, url=resource.indexable_url, fetchable_url=resource.fetchable_url, domain=resource.normalized_host, @@ -259,6 
+273,7 @@ def index_prompt(resource, response): def index_content(resource, response): print("INDEXING CONTENT...") doc = { + "url_id": resource.indexable_url, "url": resource.indexable_url, "fetchable_url": resource.fetchable_url, "domain": resource.normalized_host, @@ -270,7 +285,29 @@ def index_content(resource, response): } if response.content_type == "text/gemini": doc["lang"] = response.lang or "none", - index_writer.add_document(**doc) + with index_writer.searcher() as searcher: + result = searcher.document(url_id=resource.indexable_url) + update_document(result, **doc) + + +def index_backlinks(resource, contained_resources): + for cr in contained_resources: + with ix.searcher() as searcher: + result = searcher.document(url_id=resource.indexable_url) + backlinks = set() + if result and "backlinks" in result: + backlinks = set(result["backlinks"].split()) + backlinks.add(resource.fetchable_url) + update_document(result, url_id=cr.indexable_url, backlinks=" ".join(backlinks)) + + +def update_document(document, **kwargs): + if not document: + document = {} + # pdb.set_trace() + for key, value in kwargs.items(): + document[key] = value + index_writer.update_document(**document) def get_robots_file(robot_host): @@ -380,6 +417,7 @@ def crawl(gemini_resource): print("Extracting contained resources...") print("--------------------------") contained_resources = gr.extract_contained_resources(response.content) + index_backlinks(gr, contained_resources) for resource in contained_resources: crawl(resource) else: @@ -435,8 +473,10 @@ def run_crawl(should_run_destructive=False, seed_urls=[]): if should_run_destructive: backup_old_index(INDEX_DIR_CURRENT, INDEX_DIR_BACKUP) create_index(index_dir) + global ix + ix = index_storage.open_index() global index_writer - index_writer = index_storage.open_index().writer() + index_writer = BufferedWriter(ix, period=120, limit=1) global visited_urls visited_urls = [] if should_run_destructive else load_visited_urls(INDEX_DIR_CURRENT) 
@@ -463,7 +503,7 @@ def run_crawl(should_run_destructive=False, seed_urls=[]): seed_request_resources = [GeminiResource(url) for url in seed_request_urls] for resource in seed_request_resources: crawl(resource) - index_writer.commit() + index_writer.close() pickle_robot_file_map(robot_file_map, index_dir) diff --git a/serve/models.py b/serve/models.py @@ -5,6 +5,7 @@ from whoosh import highlight, qparser from whoosh.index import open_dir from . import constants +from gus.lib.gemini import GeminiResource from gus.lib.index_statistics import compute_index_statistics, load_last_statistics_from_file from gus.lib.misc import bytes2human from gus.lib.whoosh_extensions import GeminiFormatter, GeminiScorer @@ -24,7 +25,7 @@ class GUS(): def init_query_parser(ix): - or_group = qparser.OrGroup.factory(1.0) + or_group = qparser.OrGroup.factory(0.99) query_parser = qparser.MultifieldParser(["content", "url", "prompt"], ix.schema, group=or_group) query_parser.add_plugin(qparser.RegexPlugin()) query_parser.add_plugin(qparser.GtLtPlugin()) @@ -65,10 +66,17 @@ class GUS(): "prompt" : result["prompt"] if "prompt" in result else "", "highlights" : self.gemini_highlighter.highlight_hit(result, "content", top=1) if "content" in result and result["content_type"] in ["text/plain", "text/gemini", "text/markdown"] else "", "link_text" : GUS._get_link_text(result), + "backlinks" : result["backlinks"].split(" ") if "backlinks" in result else [], } for result in results], ) + def get_backlinks(self, url): + resource = GeminiResource(url) + result = self.searcher.document(url_id=resource.indexable_url) + return result["backlinks"].split(" ") if result and "backlinks" in result else [] + + def _get_link_text(result): if result["content_type"] == "input": prompt_suffix = ": {}".format(result["prompt"]) diff --git a/serve/templates/backlinks.gmi b/serve/templates/backlinks.gmi @@ -0,0 +1,16 @@ +{% include 'fragments/header.gmi' %} + + +{% if backlinks | length > 1 %} +## {{ backlinks|length }} 
backlinks for {{ url }} +{% elif backlinks | length > 0 %} +## {{ backlinks|length }} backlink for {{ url }} +{% else %} +## No backlinks for {{ url }} +{% endif %} + +{% for backlink in backlinks %} +=> {{ backlink }} {{ backlink[9:] }} +{% endfor %} + +{% include 'fragments/footer.gmi' %} diff --git a/serve/templates/fragments/footer.gmi b/serve/templates/fragments/footer.gmi @@ -1,3 +1,3 @@ -=> /add-seed See any missing results? Let GUS know your gemini URL exists. +=> /add-seed See any missing results? Let GUS know your Gemini URL exists. Index updated on: {{ index_modification_time|datetimeformat }} diff --git a/serve/templates/search.gmi b/serve/templates/search.gmi @@ -12,6 +12,11 @@ * Indexed at : {{ "{:%Y-%m-%d %H:%M}".format(result["indexed_at"]) }} * Charset : {{ result["charset"] }} {% endif %} +{% if result["backlinks"] | length > 1 %} +=> /backlinks?{{ result["url"] | urlencode }} {{ result["backlinks"] | length }} backlinks +{% elif result["backlinks"] | length > 0 %} +=> /backlinks?{{ result["url"] | urlencode }} {{ result["backlinks"] | length }} backlink +{% endif %} {% if result["highlights"] | length > 0 %} {{ result["highlights"] }} {% endif %} diff --git a/serve/views.py b/serve/views.py @@ -1,7 +1,7 @@ import math import os from datetime import datetime -from urllib.parse import quote +from urllib.parse import quote, unquote import jinja2 from jetforce import Request, Response, Status, JetforceApplication @@ -108,3 +108,17 @@ def search(request): return Response(Status.SUCCESS, "text/gemini", body) else: return Response(Status.INPUT, "Search query") + + +@app.route("/backlinks") +def backlinks(request): + if request.query: + url = unquote(request.query) + backlinks = gus.get_backlinks(url) + body = render_template("backlinks.gmi", + url=url, + backlinks=backlinks, + index_modification_time=gus.statistics["index_modification_time"]) + return Response(Status.SUCCESS, "text/gemini", body) + else: + return Response(Status.INPUT, "Gemini URL")