commit e61d608c8e2b9171a47ff954ecf119913e351352
parent 0a9ac040af14daff771aa43dfbdf61b214ddb9a4
Author: Natalie Pendragon <natpen@natpen.net>
Date: Sun, 24 May 2020 10:08:37 -0400
[crawl] Stop storing responses in GeminiResource objects
I think this was causing excessive memory usage (memory exhaustion), since
we were storing potentially a lot of response content in memory without
being able to free it during long chains of recursive calls to crawl() on
contained resources.
Diffstat:
2 files changed, 33 insertions(+), 37 deletions(-)
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -146,14 +146,14 @@ def create_index(index_dir):
index_storage.create_index(schema)
-def index_binary(resource):
+def index_binary(resource, response):
print("INDEXING BINARY...")
index_writer = index_storage.open_index().writer()
try:
index_writer.add_document(
url=resource.fully_qualified_massaged_url,
domain=resource.normalized_host,
- content_type=resource.response.content_type,
+ content_type=response.content_type,
indexed_at=datetime.utcnow(),
)
index_writer.commit()
@@ -161,7 +161,7 @@ def index_binary(resource):
index_writer.cancel()
-def index_prompt(resource):
+def index_prompt(resource, response):
print("INDEXING PROMPT...")
index_writer = index_storage.open_index().writer()
try:
@@ -169,7 +169,7 @@ def index_prompt(resource):
url=resource.fully_qualified_massaged_url,
domain=resource.normalized_host,
content_type="input",
- prompt=resource.response.prompt,
+ prompt=response.prompt,
indexed_at=datetime.utcnow(),
)
index_writer.commit()
@@ -177,15 +177,15 @@ def index_prompt(resource):
index_writer.cancel()
-def index_content(resource):
+def index_content(resource, response):
print("INDEXING CONTENT...")
index_writer = index_storage.open_index().writer()
try:
index_writer.add_document(
url=resource.fully_qualified_massaged_url,
domain=resource.normalized_host,
- content_type=resource.response.content_type,
- content=resource.response.content,
+ content_type=response.content_type,
+ content=response.content,
indexed_at=datetime.utcnow(),
)
index_writer.commit()
@@ -252,16 +252,16 @@ def crawl(gemini_resource):
print("Fetching {}".format(gr.fully_qualified_url))
if gr.fully_qualified_parent_url is not None:
print("With parent {}".format(gr.fully_qualified_parent_url))
- r = gr.fetch()
+ response = gr.fetch()
- if r is None:
+ if response is None:
# problem before getting a response
print("ERROR : %s" % gr.fully_qualified_url)
print("--------------------------")
crawl_statistics["broken_url_count"] += 1
- elif r.status.startswith("3"):
+ elif response.status.startswith("3"):
# redirect status
- print("REDIRECT : %s -> %s" % (gr.fully_qualified_url, r.url))
+ print("REDIRECT : %s -> %s" % (gr.fully_qualified_url, response.url))
# NB: this pop is necessary because if the redirect is a change to the URL
# structure of, essentially, the same URL (e.g., like the addition or removal
# of a trailing slash), then the crawl of the redirect would think it had
@@ -270,28 +270,28 @@ def crawl(gemini_resource):
crawl_statistics["redirect_count"] += 1
# if is_nontrivial_redirect(gr.fully_qualified_url, r.url):
# crawl_statistics["redirect_nontrivial_count"] += 1
- redirect_resource = GeminiResource(r.url, gr.normalized_url, gr.normalized_host)
+ redirect_resource = GeminiResource(response.url, gr.normalized_url, gr.normalized_host)
crawl(redirect_resource)
- elif r.status.startswith("1"):
+ elif response.status.startswith("1"):
# input status
- print("URL : %s" % r.url)
- print("STATUS : %s" % r.status)
- print("PROMPT : %s" % r.prompt)
- index_prompt(gr)
+ print("URL : %s" % response.url)
+ print("STATUS : %s" % response.status)
+ print("PROMPT : %s" % response.prompt)
+ index_prompt(gr, response)
print("--------------------------")
- elif r.status.startswith("2"):
+ elif response.status.startswith("2"):
# success status
- print("URL : %s" % r.url)
- print("STATUS : %s" % r.status)
- print("CONTENT TYPE : %s" % r.content_type)
- if r.content_type.startswith("text/"):
- index_content(gr)
+ print("URL : %s" % response.url)
+ print("STATUS : %s" % response.status)
+ print("CONTENT TYPE : %s" % response.content_type)
+ if response.content_type.startswith("text/"):
+ index_content(gr, response)
print("--------------------------")
- contained_resources = gr.extract_contained_resources()
+ contained_resources = gr.extract_contained_resources(response.content)
for resource in contained_resources:
crawl(resource)
else:
- index_binary(gr)
+ index_binary(gr, response)
print("--------------------------")
else:
# input, error, etc (all other statuses)
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -22,14 +22,14 @@ class GeminiRobotFileParser(RobotFileParser):
def read(self):
"""Reads the robots.txt URL and feeds it to the parser."""
gr = GeminiResource(self.url)
- gr.fetch()
- if gr.response is None:
+ response = gr.fetch()
+ if response is None:
self.allow_all = True
return
- if not gr.response.status.startswith("2"):
+ if not response.status.startswith("2"):
self.allow_all = True
else:
- self.parse(gr.response.content.splitlines())
+ self.parse(response.content.splitlines())
class GeminiResource():
@@ -147,9 +147,7 @@ class GeminiResource():
# NB: this intentionally does NOT fetch the normalized URL, because that could
# cause an infinite loop with, e.g., normalization stripping a trailing slash
# and a server redirecting to the same URL _with_ a trailing slash.
- response = gusmobile.fetch(self.fully_qualified_url)
- self.response = response
- return self.response
+ return gusmobile.fetch(self.fully_qualified_url)
def _get_normalized_url_and_host(self):
@@ -160,16 +158,14 @@ class GeminiResource():
return url_normalized, host_normalized
- def extract_contained_resources(self):
+ def extract_contained_resources(self, content):
# this finds all gemini URLs within the content of a given GeminiResource and
- # returns them as a list of new, unfetched GeminiResources
- if not self.response:
- pass
+ # returns them as a list of new GeminiResources
if self.contained_resources:
return self.contained_resources
link_pattern = "^=>\s*(\S+)"
- probable_urls = re.findall(link_pattern, self.response.content, re.MULTILINE)
+ probable_urls = re.findall(link_pattern, content, re.MULTILINE)
resources = []
for url in probable_urls:
resource = GeminiResource(