[crawl] Start indexing the charset - geminispace.info

commit 8a1cafaffb5a36ce9e965486a09a24927d323e07
parent 245f74634afb8c71cfc0a73c176f99980f1d7bff
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Wed,  3 Jun 2020 16:28:13 -0400

[crawl] Start indexing the charset

Diffstat:
M gus/crawl.py  | 6 ++++++
M gus/lib/index_statistics.py  | 34 ++++++++++++++++++++++++++++------
M gus/serve.py  | 22 +++++++++++++++++++++-
M poetry.lock  | 2 +-

4 files changed, 56 insertions(+), 8 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -145,6 +145,9 @@ def create_index(index_dir):
         content_type=TEXT(
             stored=True,
         ),
+        charset=ID(
+            stored=True,
+        ),
         content=TEXT(
             analyzer=FancyAnalyzer(),
             spelling=True,
@@ -171,6 +174,7 @@ def index_binary(resource, response):
             fetchable_url=resource.fetchable_url,
             domain=resource.normalized_host,
             content_type=response.content_type,
+            charset=response.charset or "none",
             indexed_at=datetime.utcnow(),
         )
         index_writer.commit()
@@ -187,6 +191,7 @@ def index_prompt(resource, response):
             fetchable_url=resource.fetchable_url,
             domain=resource.normalized_host,
             content_type="input",
+            charset=response.charset or "none",
             prompt=response.prompt,
             indexed_at=datetime.utcnow(),
         )
@@ -204,6 +209,7 @@ def index_content(resource, response):
             fetchable_url=resource.fetchable_url,
             domain=resource.normalized_host,
             content_type=response.content_type,
+            charset=response.charset or "none",
             content=response.content,
             regex=response.content,
             indexed_at=datetime.utcnow(),
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -11,16 +11,22 @@ from gus.lib.gemini import GeminiResource
 def compute_index_statistics(index_dir):
     ix = open_dir(index_dir)
 
-    # content types
-    content_types = None
+    # content types and charsets
+    content_types = set()
+    charsets = set()
     with ix.reader() as reader:
-        all_stored_fields = reader.all_stored_fields()
-        content_types = set([f["content_type"] for f in all_stored_fields])
+        for fields in reader.all_stored_fields():
+            if "charset" in fields:
+                charset = fields["charset"]
+                charsets.add(charset)
+            content_type = fields["content_type"]
+            content_types.add(content_type)
 
     # page count, domain count, content type frequencies
     page_count = 0
     domain_count = 0
     content_type_frequencies = []
+    charset_frequencies = []
     with ix.searcher() as searcher:
         page_count = searcher.doc_count()
 
@@ -32,6 +38,14 @@ def compute_index_statistics(index_dir):
             content_type_frequencies.append((content_type, len(results)))
         content_type_frequencies.sort(key=lambda pair: pair[1], reverse=True)
 
+        # charset frequencies
+        parser = QueryParser("charset", schema=ix.schema)
+        for charset in charsets:
+            query = parser.parse("charset:{}".format(charset))
+            results = searcher.search(query, limit=9999999)
+            charset_frequencies.append((charset, len(results)))
+        charset_frequencies.sort(key=lambda pair: pair[1], reverse=True)
+
         query = Every("url")
         results = searcher.search(query, limit=9999999)
         domains = set()
@@ -47,6 +61,7 @@ def compute_index_statistics(index_dir):
         "page_count": page_count,
         "domain_count": domain_count,
         "content_type_frequencies": content_type_frequencies,
+        "charset_frequencies": charset_frequencies,
         "domains": domains,
     }
 
@@ -67,10 +82,14 @@ def print_index_statistics(index_statistics, crawl_statistics):
     for pair in index_statistics["content_type_frequencies"]:
         print("{:>5} - {}".format(pair[1], pair[0]))
 
+    print("\nCharsets:")
+    for pair in index_statistics["charset_frequencies"]:
+        print("{:>5} - {}".format(pair[1], pair[0]))
+
 
 def run_index_statistics():
     index_statistics = compute_index_statistics("index")
-    print_index_statistics(index_statistics)
+    print_index_statistics(index_statistics, None)
     # persist_index_statistics(index_statistics, "index-statistics.csv")
 
 
@@ -80,7 +99,7 @@ def persist_statistics(index_statistics, crawl_statistics, was_destructive, file
 
 
 def serialize_statistics_line(index_statistics, crawl_statistics, was_destructive):
-    return "{:%Y-%m-%d},{},{},{},{},{},{},{},{}\n".format(
+    return "{:%Y-%m-%d},{},{},{},{},{},{},{},{},{}\n".format(
         index_statistics["index_modification_time"],
         was_destructive,
         index_statistics["page_count"],
@@ -90,6 +109,7 @@ def serialize_statistics_line(index_statistics, crawl_statistics, was_destructiv
         crawl_statistics["broken_url_count"],
         "|".join(index_statistics["domains"]),
         "|".join("{}:{}".format(pair[0], pair[1]) for pair in index_statistics["content_type_frequencies"]),
+        "|".join("{}:{}".format(pair[0], pair[1]) for pair in index_statistics["charset_frequencies"]),
     )
 
 
@@ -112,6 +132,7 @@ def deserialize_statistics_line(line):
     broken_url_count = line_parts[6]
     domains = [domain for domain in line_parts[7].split("|")]
     content_type_frequencies = [(pair.split(":")[0], pair.split(":")[1]) for pair in line_parts[8].split("|")]
+    charset_frequencies = [(pair.split(":")[0], pair.split(":")[1]) for pair in line_parts[9].split("|")]
 
     return {
         "index_modification_time": index_modification_time,
@@ -122,4 +143,5 @@ def deserialize_statistics_line(line):
         "broken_url_count": broken_url_count,
         "domains": domains,
         "content_type_frequencies": content_type_frequencies,
+        "charset_frequencies": charset_frequencies,
     }
diff --git a/gus/serve.py b/gus/serve.py
@@ -40,6 +40,7 @@ def load_and_compute_statistics(filename):
     statistics["page_count"] = index_statistics["page_count"]
     statistics["domain_count"] = index_statistics["domain_count"]
     statistics["content_type_frequencies"] = index_statistics["content_type_frequencies"]
+    statistics["charset_frequencies"] = index_statistics["charset_frequencies"]
     statistics["domains"] = index_statistics["domains"]
     return statistics
 
@@ -83,6 +84,15 @@ def _render_index_statistics():
     ]
     for pair in last_statistics["content_type_frequencies"]:
         d.append("{:>5} - {}".format(pair[1], pair[0]))
+    d.extend([
+        "",
+        "## By Charset",
+        "",
+        "These figures are representative of the number of pages seen per content type at the time the current index was last updated on {:%Y-%m-%d}.".format(last_statistics["index_modification_time"]),
+        "",
+    ])
+    for pair in last_statistics["charset_frequencies"]:
+        d.append("{:>5} - {}".format(pair[1], pair[0]))
     return d
 
 
@@ -106,6 +116,10 @@ def _render_news():
     ]
     news_items = [
         {
+            "date": "2020-06-03",
+            "content": "Added ability to search and filter by charset. Documentation for this feature can be found on the advanced searching section of the about page!",
+        },
+        {
             "date": "2020-05-21",
             "content": "Added ability to search and filter by domain. Documentation for this feature can be found on the advanced searching section of the about page!",
         },
@@ -188,6 +202,7 @@ def index(request):
         "To improve the quality of your search results, you can apply filters to constrain your search results in various dimensions. The currently implemented filters are:",
         "* content_type",
         "* domain",
+        "* charset",
         "",
         "To filter by one of these, simply add it to your query followed by a colon, and the value you wish to filter by. Some examples of doing so follow.",
         "",
@@ -199,7 +214,10 @@ def index(request):
         "=> /search?domain:circumlunar domain:circumlunar",
         "=> /search?contextual%20domain:gus contextual domain:gus",
         "",
-        "For further inspiration on how to use these filters, you can visit both GUS' list of known hosts, as well as GUS' list of known content_types on the statistics page.",
+        "=> /search?computers%20content_type%3Agemini%20AND%20NOT%20charset%3AUS-ASCII computers content_type:gemini AND NOT charset:US-ASCII",
+        "=> /search?NOT%20charset%3Anone NOT charset:none",
+        "",
+        "For further inspiration on how to use these filters, you can visit both GUS' list of known hosts, as well as GUS' list of known content_types and charsets on the statistics page. Note that there is some nuance to the charset values, due to the fact that specifying them is optional, and if one does not specify, there is a default of utf-8 - pages that do not specify a charset have an indexed charset value of \"none\".",
         "",
         "=> /known-hosts GUS Known Hosts (with list of domains)",
         "=> /statistics GUS statistics (with list of content_types)",
@@ -254,6 +272,7 @@ def _search_index(query, requested_page):
             "url"          : result["url"],
             "fetchable_url": result["fetchable_url"],
             "content_type" : result["content_type"],
+            "charset"      : result["charset"] if "charset" in result else "none",
             "prompt"       : result["prompt"] if "prompt" in result else "",
             "highlights"   : gemini_highlighter.highlight_hit(result, "content", top=1) if "content" in result and result["content_type"] in ["text/plain", "text/gemini", "text/markdown"] else "",
         } for result in results
@@ -300,6 +319,7 @@ def _render_results(results, verbose=False):
         if verbose:
             data.append("* Score      : {:.2f}".format(result["score"]))
             data.append("* Indexed at : {:%Y-%m-%d %H:%M}".format(result["indexed_at"]))
+            data.append("* Charset    : {}".format(result["charset"]))
         if len(result["highlights"]) > 0:
             data.extend(result["highlights"].split(GeminiFormatter.between))
     return data
diff --git a/poetry.lock b/poetry.lock
@@ -100,7 +100,7 @@ python-versions = "*"
 version = "0.1.0"
 
 [package.source]
-reference = "a252aed301aa182a19571465c725a832530f95c7"
+reference = "c8867e2a90165958ae58e444791c0003329c6501"
 type = "git"
 url = "https://git.sr.ht/~natpen/gusmobile"
 [[package]]

	geminispace.info gemini search engine
	git clone https://git.clttr.info/geminispace.info.git
	Log (Feed) | Files | Refs (Tags) | README | LICENSE

M	gus/crawl.py	|	6	++++++
M	gus/lib/index_statistics.py	|	34	++++++++++++++++++++++++++++------
M	gus/serve.py	|	22	+++++++++++++++++++++-
M	poetry.lock	|	2	+-