commit b6ffd1fc684e26be9037d2959682590bb959ee20
parent b8fb69f84254a1cfd695683dd50d159e59d9ab14
Author: Natalie Pendragon <natpen@natpen.net>
Date: Tue, 30 Jun 2020 07:07:36 -0400
[crawl] [serve] Add backlinks
Diffstat:
6 files changed, 94 insertions(+), 11 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -10,11 +10,12 @@ from urllib.parse import urljoin, uses_relative, uses_netloc
import gusmobile as gemini
from whoosh.analysis import FancyAnalyzer
-from whoosh.fields import Schema, TEXT, DATETIME, STORED, ID, NUMERIC
+from whoosh.fields import Schema, TEXT, DATETIME, STORED, ID, NUMERIC, KEYWORD
from whoosh.filedb.filestore import FileStorage
from whoosh.index import create_in, open_dir
from whoosh.query import Every
from whoosh.qparser import QueryParser
+from whoosh.writing import BufferedWriter
from gus.lib.index_statistics import compute_index_statistics, persist_statistics, print_index_statistics
from gus.lib.whoosh_extensions import UrlAnalyzer
@@ -187,6 +188,9 @@ def create_index(index_dir):
shutil.rmtree(index_dir, ignore_errors=True)
pathlib.Path(index_dir).mkdir(parents=True, exist_ok=True)
schema = Schema(
+ url_id=ID(
+ unique=True,
+ ),
url=TEXT(
field_boost=2.0,
stored=True,
@@ -221,8 +225,11 @@ def create_index(index_dir):
signed=False,
stored=True,
),
+ backlinks=KEYWORD(
+ stored=True,
+ ),
indexed_at=DATETIME(
- stored=True
+ stored=True,
),
)
index_storage.create_index(schema)
@@ -230,8 +237,11 @@ def create_index(index_dir):
def index_binary(resource, response):
print("INDEXING BINARY...")
-
- index_writer.add_document(
+ with index_writer.searcher() as searcher:
+ result = searcher.document(url_id=resource.indexable_url)
+ update_document(
+ result,
+ url_id=resource.indexable_url,
url=resource.indexable_url,
fetchable_url=resource.fetchable_url,
domain=resource.normalized_host,
@@ -244,7 +254,11 @@ def index_binary(resource, response):
def index_prompt(resource, response):
print("INDEXING PROMPT...")
- index_writer.add_document(
+ with index_writer.searcher() as searcher:
+ result = searcher.document(url_id=resource.indexable_url)
+ update_document(
+ result,
+ url_id=resource.indexable_url,
url=resource.indexable_url,
fetchable_url=resource.fetchable_url,
domain=resource.normalized_host,
@@ -259,6 +273,7 @@ def index_prompt(resource, response):
def index_content(resource, response):
print("INDEXING CONTENT...")
doc = {
+ "url_id": resource.indexable_url,
"url": resource.indexable_url,
"fetchable_url": resource.fetchable_url,
"domain": resource.normalized_host,
@@ -270,7 +285,29 @@ def index_content(resource, response):
}
if response.content_type == "text/gemini":
doc["lang"] = response.lang or "none",
- index_writer.add_document(**doc)
+ with index_writer.searcher() as searcher:
+ result = searcher.document(url_id=resource.indexable_url)
+ update_document(result, **doc)
+
+
+def index_backlinks(resource, contained_resources):
+ for cr in contained_resources:
+ with ix.searcher() as searcher:
+ result = searcher.document(url_id=cr.indexable_url)
+ backlinks = set()
+ if result and "backlinks" in result:
+ backlinks = set(result["backlinks"].split())
+ backlinks.add(resource.fetchable_url)
+ update_document(result, url_id=cr.indexable_url, backlinks=" ".join(backlinks))
+
+
+def update_document(document, **kwargs):
+ if not document:
+ document = {}
+ # pdb.set_trace()
+ for key, value in kwargs.items():
+ document[key] = value
+ index_writer.update_document(**document)
def get_robots_file(robot_host):
@@ -380,6 +417,7 @@ def crawl(gemini_resource):
print("Extracting contained resources...")
print("--------------------------")
contained_resources = gr.extract_contained_resources(response.content)
+ index_backlinks(gr, contained_resources)
for resource in contained_resources:
crawl(resource)
else:
@@ -435,8 +473,10 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
if should_run_destructive:
backup_old_index(INDEX_DIR_CURRENT, INDEX_DIR_BACKUP)
create_index(index_dir)
+ global ix
+ ix = index_storage.open_index()
global index_writer
- index_writer = index_storage.open_index().writer()
+ index_writer = BufferedWriter(ix, period=120, limit=1)
global visited_urls
visited_urls = [] if should_run_destructive else load_visited_urls(INDEX_DIR_CURRENT)
@@ -463,7 +503,7 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
seed_request_resources = [GeminiResource(url) for url in seed_request_urls]
for resource in seed_request_resources:
crawl(resource)
- index_writer.commit()
+ index_writer.close()
pickle_robot_file_map(robot_file_map, index_dir)
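The crawl changes above switch from plain add_document() calls to an upsert keyed on the new unique url_id field, with writes buffered through Whoosh's BufferedWriter. A minimal, self-contained sketch of that pattern (not part of the commit; the index directory, field subset, helper name, and URLs are placeholders):

import tempfile
from whoosh.fields import Schema, ID, KEYWORD
from whoosh.index import create_in
from whoosh.writing import BufferedWriter

# Reduced schema: just the unique key and the backlinks keyword field.
schema = Schema(url_id=ID(unique=True, stored=True), backlinks=KEYWORD(stored=True))
ix = create_in(tempfile.mkdtemp(), schema)

# BufferedWriter flushes every `period` seconds or after `limit` buffered
# documents, and its searcher() also sees documents that are not yet committed.
writer = BufferedWriter(ix, period=120, limit=1)

def add_backlink(target_url, linking_url):
    # Read the target's stored fields, merge in the new backlink, and resubmit
    # the whole document; update_document() replaces any existing document
    # that has the same unique url_id.
    with writer.searcher() as searcher:
        existing = searcher.document(url_id=target_url) or {}
    backlinks = set(existing.get("backlinks", "").split())
    backlinks.add(linking_url)
    existing.update(url_id=target_url, backlinks=" ".join(backlinks))
    writer.update_document(**existing)

add_backlink("gemini://example.com/b", "gemini://example.com/a")
add_backlink("gemini://example.com/b", "gemini://example.com/c")
writer.close()  # commits the buffer, like index_writer.close() at the end of run_crawl()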
diff --git a/serve/models.py b/serve/models.py
@@ -5,6 +5,7 @@ from whoosh import highlight, qparser
from whoosh.index import open_dir
from . import constants
+from gus.lib.gemini import GeminiResource
from gus.lib.index_statistics import compute_index_statistics, load_last_statistics_from_file
from gus.lib.misc import bytes2human
from gus.lib.whoosh_extensions import GeminiFormatter, GeminiScorer
@@ -24,7 +25,7 @@ class GUS():
def init_query_parser(ix):
- or_group = qparser.OrGroup.factory(1.0)
+ or_group = qparser.OrGroup.factory(0.99)
query_parser = qparser.MultifieldParser(["content", "url", "prompt"], ix.schema, group=or_group)
query_parser.add_plugin(qparser.RegexPlugin())
query_parser.add_plugin(qparser.GtLtPlugin())
@@ -65,10 +66,17 @@ class GUS():
"prompt" : result["prompt"] if "prompt" in result else "",
"highlights" : self.gemini_highlighter.highlight_hit(result, "content", top=1) if "content" in result and result["content_type"] in ["text/plain", "text/gemini", "text/markdown"] else "",
"link_text" : GUS._get_link_text(result),
+ "backlinks" : result["backlinks"].split(" ") if "backlinks" in result else [],
} for result in results],
)
+ def get_backlinks(self, url):
+ resource = GeminiResource(url)
+ result = self.searcher.document(url_id=resource.indexable_url)
+ return result["backlinks"].split(" ") if result and "backlinks" in result else []
+
+
def _get_link_text(result):
if result["content_type"] == "input":
prompt_suffix = ": {}".format(result["prompt"])
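On the serve side, backlinks come back as a single space-delimited string stored in the KEYWORD field, and get_backlinks() looks the document up by its normalized url_id. A rough sketch of that lookup outside the app (the index path is a placeholder, and the stand-in normalize() glosses over what GeminiResource(url).indexable_url actually does, which is defined in gus.lib.gemini rather than in this diff):

from whoosh.index import open_dir

def normalize(url):
    # stand-in for GeminiResource(url).indexable_url; the real normalization
    # is not shown in this diff
    return url

ix = open_dir("index")  # placeholder path to an existing GUS index
with ix.searcher() as searcher:
    result = searcher.document(url_id=normalize("gemini://example.com/foo"))
    backlinks = result["backlinks"].split(" ") if result and "backlinks" in result else []
    print(backlinks)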
diff --git a/serve/templates/backlinks.gmi b/serve/templates/backlinks.gmi
@@ -0,0 +1,16 @@
+{% include 'fragments/header.gmi' %}
+
+
+{% if backlinks | length > 1 %}
+## {{ backlinks|length }} backlinks for {{ url }}
+{% elif backlinks | length > 0 %}
+## {{ backlinks|length }} backlink for {{ url }}
+{% else %}
+## No backlinks for {{ url }}
+{% endif %}
+
+{% for backlink in backlinks %}
+=> {{ backlink }} {{ backlink[9:] }}
+{% endfor %}
+
+{% include 'fragments/footer.gmi' %}
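For reference, the pluralization and the backlink[9:] slice (which drops the nine-character "gemini://" prefix from the link label) can be checked by rendering the template body on its own; a quick sketch with the header/footer includes left out and made-up URLs:

from jinja2 import Template

body = """{% if backlinks | length > 1 %}
## {{ backlinks|length }} backlinks for {{ url }}
{% elif backlinks | length > 0 %}
## {{ backlinks|length }} backlink for {{ url }}
{% else %}
## No backlinks for {{ url }}
{% endif %}
{% for backlink in backlinks %}
=> {{ backlink }} {{ backlink[9:] }}
{% endfor %}"""

print(Template(body).render(
    url="gemini://example.com/foo",
    backlinks=["gemini://example.com/", "gemini://example.org/links.gmi"],
))
# prints "## 2 backlinks for gemini://example.com/foo" followed by
# "=> gemini://example.com/ example.com/" style link lines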
diff --git a/serve/templates/fragments/footer.gmi b/serve/templates/fragments/footer.gmi
@@ -1,3 +1,3 @@
-=> /add-seed See any missing results? Let GUS know your gemini URL exists.
+=> /add-seed See any missing results? Let GUS know your Gemini URL exists.
Index updated on: {{ index_modification_time|datetimeformat }}
diff --git a/serve/templates/search.gmi b/serve/templates/search.gmi
@@ -12,6 +12,11 @@
* Indexed at : {{ "{:%Y-%m-%d %H:%M}".format(result["indexed_at"]) }}
* Charset : {{ result["charset"] }}
{% endif %}
+{% if result["backlinks"] | length > 1 %}
+=> /backlinks?{{ result["url"] | urlencode }} {{ result["backlinks"] | length }} backlinks
+{% elif result["backlinks"] | length > 0 %}
+=> /backlinks?{{ result["url"] | urlencode }} {{ result["backlinks"] | length }} backlink
+{% endif %}
{% if result["highlights"] | length > 0 %}
{{ result["highlights"] }}
{% endif %}
diff --git a/serve/views.py b/serve/views.py
@@ -1,7 +1,7 @@
import math
import os
from datetime import datetime
-from urllib.parse import quote
+from urllib.parse import quote, unquote
import jinja2
from jetforce import Request, Response, Status, JetforceApplication
@@ -108,3 +108,17 @@ def search(request):
return Response(Status.SUCCESS, "text/gemini", body)
else:
return Response(Status.INPUT, "Search query")
+
+
+@app.route("/backlinks")
+def backlinks(request):
+ if request.query:
+ url = unquote(request.query)
+ backlinks = gus.get_backlinks(url)
+ body = render_template("backlinks.gmi",
+ url=url,
+ backlinks=backlinks,
+ index_modification_time=gus.statistics["index_modification_time"])
+ return Response(Status.SUCCESS, "text/gemini", body)
+ else:
+ return Response(Status.INPUT, "Gemini URL")
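The new route answers a standard Gemini request whose query string is the percent-encoded URL, mirroring the unquote() call in the backlinks() handler above. A minimal client sketch (the host is a placeholder for a running GUS instance; certificate checking is disabled because Gemini servers commonly use self-signed certificates):

import socket
import ssl
from urllib.parse import quote

host = "localhost"                    # placeholder GUS host
target = "gemini://example.com/foo"   # page whose backlinks we want

context = ssl.create_default_context()
context.check_hostname = False
context.verify_mode = ssl.CERT_NONE

with socket.create_connection((host, 1965)) as sock:
    with context.wrap_socket(sock, server_hostname=host) as tls:
        # A Gemini request is a single absolute URL terminated by CRLF.
        request = "gemini://{}/backlinks?{}\r\n".format(host, quote(target, safe=""))
        tls.sendall(request.encode("utf-8"))
        print(tls.makefile("rb").read().decode("utf-8"))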