commit 20a5cb896d0eee3d271565ce420f74a598e1f94b
parent d16c11de01f336034c8b4e70333c104b5c0f77a0
Author: Natalie Pendragon <natpen@natpen.net>
Date: Sat, 16 May 2020 08:00:35 -0400
[crawl] [serve] Add more statistics
Diffstat:
4 files changed, 55 insertions(+), 20 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -132,3 +132,4 @@ dmypy.json
/seed-requests.txt
/index-statistics.csv
+/statistics.csv
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -15,7 +15,7 @@ from whoosh.index import create_in, open_dir
from whoosh.query import Every
from whoosh.qparser import QueryParser
-from gus.lib.index_statistics import compute_index_statistics, persist_index_statistics
+from gus.lib.index_statistics import compute_index_statistics, persist_statistics
from gus.lib.whoosh_extensions import UrlAnalyzer
from gus.lib.url_helpers import normalize_gemini_url
@@ -244,10 +244,14 @@ def crawl_url(url):
# problem before getting a response
print("ERROR : %s" % url)
print("--------------------------")
+ crawl_statistics["broken_url_count"] += 1
elif r.status.startswith("3"):
# redirect status
print("REDIRECT : %s -> %s" % (url, r.url))
visited_urls.pop()
+ crawl_statistics["redirect_count"] += 1
+ if is_nontrivial_redirect(url, r.url):
+ crawl_statistics["redirect_nontrivial_count"] += 1
crawl_url(r.url)
elif r.status.startswith("1"):
# input status
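
[note] For readers outside the Gemini ecosystem: Gemini status codes are two digits, and the first digit carries the response class (per the Gemini protocol spec), which is why the crawler branches on startswith(). A tiny illustration:

    # Gemini status classes, keyed by the first digit of the two-digit code
    STATUS_CLASSES = {
        "1": "input",
        "2": "success",
        "3": "redirect",
        "4": "temporary failure",
        "5": "permanent failure",
        "6": "client certificate required",
    }
    assert STATUS_CLASSES["30"[0]] == "redirect"
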
@@ -276,6 +280,10 @@ def crawl_url(url):
print("--------------------------")
+def is_nontrivial_redirect(url, redirect_url):
+ return url.rstrip("/") != redirect_url.rstrip("/")
+
+
def main():
create_index(INDEX_DIR)
global visited_urls
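
[note] The helper above treats a redirect as trivial only when the targets differ by a trailing slash; rstrip("/") is needed (a bare rstrip() would strip whitespace instead). A minimal, runnable sketch with invented URLs:

    def is_nontrivial_redirect(url, redirect_url):
        # trivial means the two URLs differ only by a trailing slash
        return url.rstrip("/") != redirect_url.rstrip("/")

    # Hypothetical examples:
    assert not is_nontrivial_redirect("gemini://example.com/foo",
                                      "gemini://example.com/foo/")
    assert is_nontrivial_redirect("gemini://example.com/foo",
                                  "gemini://example.com/bar")
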
@@ -284,16 +292,30 @@ def main():
robot_file_map = {}
global domain_hit_timings
domain_hit_timings = {}
+ global crawl_statistics
+ crawl_statistics = {
+ # counts every redirect, trivial or not
+ "redirect_count": 0,
+ # more than just adding/removing trailing slash
+ "redirect_nontrivial_count": 0,
+ "broken_url_count": 0,
+ }
for url in SEED_URLS:
crawl_url(url)
+
index_statistics = compute_index_statistics("index")
print("Page count: {}".format(index_statistics["page_count"]))
print("Domain count: {}".format(index_statistics["domain_count"]))
print("Domains: {}".format(index_statistics["domains"]))
+
+ print("Redirect count: {}".format(crawl_statistics["redirect_count"]))
+ print("Nontrivial redirect count: {}".format(crawl_statistics["redirect_nontrivial_count"]))
+ print("Broken URL count: {}".format(crawl_statistics["broken_url_count"]))
+
print("\nContent Types:")
for pair in index_statistics["content_type_frequencies"]:
print(pair)
- persist_index_statistics(index_statistics, "index-statistics.csv")
+ persist_statistics(index_statistics, crawl_statistics, "statistics.csv")
if __name__ == "__main__":
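
[note] The crawl counters are plain dict entries pre-declared in main() and bumped in place. A sketch of the same bookkeeping with collections.Counter, which tolerates missing keys (the function name and None-status convention are illustrative, not part of this change; is_nontrivial_redirect is reused from crawl.py):

    from collections import Counter

    crawl_statistics = Counter()

    def record_response(status, url, redirect_url=None):
        # mirrors the branches in crawl_url(); None stands in for a
        # request that failed before any response arrived
        if status is None:
            crawl_statistics["broken_url_count"] += 1
        elif status.startswith("3"):
            crawl_statistics["redirect_count"] += 1
            if is_nontrivial_redirect(url, redirect_url):
                crawl_statistics["redirect_nontrivial_count"] += 1

One upside of the explicit dict used in the patch is that a typo'd key raises KeyError instead of silently creating a new counter.
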
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -69,40 +69,50 @@ def run_index_statistics():
# persist_index_statistics(index_statistics, "index-statistics.csv")
-def persist_index_statistics(index_statistics, filename):
+def persist_statistics(index_statistics, crawl_statistics, filename):
with open(filename, "a") as f:
- f.write(serialize_index_statistics_line(index_statistics))
+ f.write(serialize_statistics_line(index_statistics, crawl_statistics))
-def serialize_index_statistics_line(index_statistics):
- return "{:%Y-%m-%d},{},{},{},{}\n".format(
+def serialize_statistics_line(index_statistics, crawl_statistics):
+ return "{:%Y-%m-%d},{},{},{},{},{},{},{}\n".format(
index_statistics["index_modification_time"],
index_statistics["page_count"],
index_statistics["domain_count"],
+ crawl_statistics["redirect_count"],
+ crawl_statistics["redirect_nontrivial_count"],
+ crawl_statistics["broken_url_count"],
"|".join(index_statistics["domains"]),
"|".join("{}:{}".format(pair[0], pair[1]) for pair in index_statistics["content_type_frequencies"]),
)
-def load_last_index_statistics_from_file(filename):
+def load_last_statistics_from_file(filename):
with open(filename) as f:
data = f.readlines()
lastline = data[-1].strip()
- index_statistics = deserialize_index_statistics_line(lastline)
- return index_statistics
+ statistics = deserialize_statistics_line(lastline)
+ return statistics
-def deserialize_index_statistics_line(line):
+def deserialize_statistics_line(line):
line_parts = line.split(",")
index_modification_time = line_parts[0]
page_count = line_parts[1]
domain_count = line_parts[2]
- domains = [domain for domain in line_parts[3].split("|")]
- content_type_frequencies = [(pair.split(":")[0], pair.split(":")[1]) for pair in line_parts[4].split("|")]
+ redirect_count = line_parts[3]
+ redirect_nontrivial_count = line_parts[4]
+ broken_url_count = line_parts[5]
+ domains = line_parts[6].split("|")
+ content_type_frequencies = [(pair.split(":")[0], pair.split(":")[1]) for pair in line_parts[7].split("|")]
+
return {
"index_modification_time": index_modification_time,
"page_count": page_count,
"domain_count": domain_count,
+ "redirect_count": redirect_count,
+ "redirect_nontrivial_count": redirect_nontrivial_count,
+ "broken_url_count": broken_url_count,
"domains": domains,
"content_type_frequencies": content_type_frequencies,
}
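
[note] The persisted row now has eight comma-separated fields: date, page count, domain count, the three crawl counters, then pipe-delimited domains and content types. A round-trip sketch with an invented row:

    # invented example row in the new format
    line = "2020-05-16,1000,50,12,3,7,example.com|foo.org,text/gemini:900|text/plain:100"
    stats = deserialize_statistics_line(line)
    assert stats["redirect_count"] == "12"   # fields come back as strings
    assert stats["domains"] == ["example.com", "foo.org"]
    assert stats["content_type_frequencies"][0] == ("text/gemini", "900")

Worth noting: deserialized counts stay strings, and a domain or content type containing "," or "|" would break the format.
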
diff --git a/gus/serve.py b/gus/serve.py
@@ -13,9 +13,9 @@ from jetforce import Response, Status
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser
-from gus.lib.index_statistics import load_last_index_statistics_from_file
+from gus.lib.index_statistics import load_last_statistics_from_file
-last_index_statistics = load_last_index_statistics_from_file("index-statistics.csv")
+last_statistics = load_last_statistics_from_file("statistics.csv")
app = jetforce.JetforceApplication()
@@ -44,17 +44,19 @@ def _render_index_statistics():
"",
"## Overall",
"",
- "These figures are representative of the aggregate size of Geminispace at the time the current index was generated on {}.".format(last_index_statistics["index_modification_time"]),
+ "These figures are representative of the aggregate size of Geminispace at the time the current index was generated on {}.".format(last_statistics["index_modification_time"]),
"",
- "Page Count : {:>5}".format(last_index_statistics["page_count"]),
- "Domain Count : {:>5}".format(last_index_statistics["domain_count"]),
+ "Page Count : {:>5}".format(last_statistics["page_count"]),
+ "Domain Count : {:>5}".format(last_statistics["domain_count"]),
+ "Redirect Count : {:>5}".format(last_statistics["redirect_count"]),
+ "Broken URL Count : {:>5}".format(last_statistics["broken_url_count"]),
"",
"## By Content Type",
"",
- "These figures are representative of the number of pages seen per content type at the time the current index was generated on {}.".format(last_index_statistics["index_modification_time"]),
+ "These figures are representative of the number of pages seen per content type at the time the current index was generated on {}.".format(last_statistics["index_modification_time"]),
"",
]
- for pair in last_index_statistics["content_type_frequencies"]:
+ for pair in last_statistics["content_type_frequencies"]:
d.append("{:>5} - {}".format(pair[1], pair[0]))
return d
@@ -67,7 +69,7 @@ def _render_known_hosts():
"These are the hosts in Geminispace that GUS is aware of. Note that this list is auto-generated from the index, so if your host is not showing up here, it also won't have its content represented in GUS search results! There is a link at the bottom to add your host to the list of seeds for the next crawl of Geminispace though, after which your host should start showing up.",
"",
]
- for domain in last_index_statistics["domains"]:
+ for domain in last_statistics["domains"]:
d.append("=> {}".format(domain))
return d
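
[note] For context on the {:>5} templates above: each count is right-aligned in a five-character field, so multi-digit values line up under one another in the rendered gemtext. A small sketch (values invented):

    print("Redirect Count : {:>5}".format(12))     # Redirect Count :    12
    print("Redirect Count : {:>5}".format(12345))  # Redirect Count : 12345
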