geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit e61d608c8e2b9171a47ff954ecf119913e351352
parent 0a9ac040af14daff771aa43dfbdf61b214ddb9a4
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Sun, 24 May 2020 10:08:37 -0400

[crawl] Stop storing responses in GeminiResource objects

I think this was causing memory overflows, since we were storing
potentially a lot of response content in memory without being able to
clean it up during long chains of recursive calls to crawl() of
contained resources.
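
In practice the change is: fetch() now returns the response instead of caching it on the GeminiResource, and crawl() threads that response through to the indexing helpers and to extract_contained_resources(), so a response only lives for the duration of the crawl() frame that fetched it rather than being pinned by the resource object. Below is a minimal, self-contained sketch of that pattern; the Resource class, the hard-coded response, and the depth guard are made-up stand-ins for illustration, and only the overall shape matches the real code in the diff below.

# Sketch of the pattern this commit moves to. Everything here is a hypothetical
# stand-in (Resource, the canned response dict, the depth guard); only the
# shape mirrors GeminiResource/crawl() in the diff below.

class Resource:
    def __init__(self, url):
        self.url = url

    def fetch(self):
        # New style: return the response rather than assigning self.response,
        # so the Resource object never keeps the body alive.
        return {"status": "20",
                "content_type": "text/gemini",
                "content": "=> gemini://example.org/a\n=> gemini://example.org/b\n"}


def extract_contained_resources(content):
    # Content is passed in explicitly, mirroring the new
    # extract_contained_resources(self, content) signature.
    return [Resource(line[2:].strip())
            for line in content.splitlines() if line.startswith("=>")]


def crawl(resource, depth=0):
    response = resource.fetch()
    if response is None or not response["status"].startswith("2"):
        return
    # index_content(resource, response) would run here in the real crawler.
    if depth < 2:  # keep this toy crawl finite
        for child in extract_contained_resources(response["content"]):
            crawl(child, depth + 1)
    # When this frame returns, its response becomes unreachable and can be
    # freed; previously it lived on the resource object for as long as that
    # object was referenced anywhere.


if __name__ == "__main__":
    crawl(Resource("gemini://example.org/"))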

Diffstat:
M gus/crawl.py | 50 +++++++++++++++++++++++++-------------------------
M gus/lib/gemini.py | 20 ++++++++------------
2 files changed, 33 insertions(+), 37 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -146,14 +146,14 @@ def create_index(index_dir):
     index_storage.create_index(schema)
 
 
-def index_binary(resource):
+def index_binary(resource, response):
     print("INDEXING BINARY...")
     index_writer = index_storage.open_index().writer()
     try:
         index_writer.add_document(
             url=resource.fully_qualified_massaged_url,
             domain=resource.normalized_host,
-            content_type=resource.response.content_type,
+            content_type=response.content_type,
             indexed_at=datetime.utcnow(),
         )
         index_writer.commit()
@@ -161,7 +161,7 @@ def index_binary(resource):
         index_writer.cancel()
 
 
-def index_prompt(resource):
+def index_prompt(resource, response):
     print("INDEXING PROMPT...")
     index_writer = index_storage.open_index().writer()
     try:
@@ -169,7 +169,7 @@ def index_prompt(resource):
             url=resource.fully_qualified_massaged_url,
             domain=resource.normalized_host,
             content_type="input",
-            prompt=resource.response.prompt,
+            prompt=response.prompt,
             indexed_at=datetime.utcnow(),
         )
         index_writer.commit()
@@ -177,15 +177,15 @@ def index_prompt(resource):
         index_writer.cancel()
 
 
-def index_content(resource):
+def index_content(resource, response):
     print("INDEXING CONTENT...")
     index_writer = index_storage.open_index().writer()
     try:
         index_writer.add_document(
             url=resource.fully_qualified_massaged_url,
             domain=resource.normalized_host,
-            content_type=resource.response.content_type,
-            content=resource.response.content,
+            content_type=response.content_type,
+            content=response.content,
             indexed_at=datetime.utcnow(),
         )
         index_writer.commit()
@@ -252,16 +252,16 @@ def crawl(gemini_resource):
     print("Fetching {}".format(gr.fully_qualified_url))
     if gr.fully_qualified_parent_url is not None:
         print("With parent {}".format(gr.fully_qualified_parent_url))
-    r = gr.fetch()
+    response = gr.fetch()
 
-    if r is None:
+    if response is None:
         # problem before getting a response
         print("ERROR : %s" % gr.fully_qualified_url)
         print("--------------------------")
         crawl_statistics["broken_url_count"] += 1
-    elif r.status.startswith("3"):
+    elif response.status.startswith("3"):
         # redirect status
-        print("REDIRECT : %s -> %s" % (gr.fully_qualified_url, r.url))
+        print("REDIRECT : %s -> %s" % (gr.fully_qualified_url, response.url))
         # NB: this pop is necessary because if the redirect is a change to the URL
         # structure of, essentially, the same URL (e.g., like the addition or removal
         # of a trailing slash), then the crawl of the redirect would think it had
@@ -270,28 +270,28 @@ def crawl(gemini_resource):
         crawl_statistics["redirect_count"] += 1
         # if is_nontrivial_redirect(gr.fully_qualified_url, r.url):
         #     crawl_statistics["redirect_nontrivial_count"] += 1
-        redirect_resource = GeminiResource(r.url, gr.normalized_url, gr.normalized_host)
+        redirect_resource = GeminiResource(response.url, gr.normalized_url, gr.normalized_host)
         crawl(redirect_resource)
-    elif r.status.startswith("1"):
+    elif response.status.startswith("1"):
         # input status
-        print("URL : %s" % r.url)
-        print("STATUS : %s" % r.status)
-        print("PROMPT : %s" % r.prompt)
-        index_prompt(gr)
+        print("URL : %s" % response.url)
+        print("STATUS : %s" % response.status)
+        print("PROMPT : %s" % response.prompt)
+        index_prompt(gr, response)
         print("--------------------------")
-    elif r.status.startswith("2"):
+    elif response.status.startswith("2"):
         # success status
-        print("URL : %s" % r.url)
-        print("STATUS : %s" % r.status)
-        print("CONTENT TYPE : %s" % r.content_type)
-        if r.content_type.startswith("text/"):
-            index_content(gr)
+        print("URL : %s" % response.url)
+        print("STATUS : %s" % response.status)
+        print("CONTENT TYPE : %s" % response.content_type)
+        if response.content_type.startswith("text/"):
+            index_content(gr, response)
             print("--------------------------")
-            contained_resources = gr.extract_contained_resources()
+            contained_resources = gr.extract_contained_resources(response.content)
             for resource in contained_resources:
                 crawl(resource)
         else:
-            index_binary(gr)
+            index_binary(gr, response)
             print("--------------------------")
     else:
         # input, error, etc (all other statuses)
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -22,14 +22,14 @@ class GeminiRobotFileParser(RobotFileParser):
     def read(self):
         """Reads the robots.txt URL and feeds it to the parser."""
         gr = GeminiResource(self.url)
-        gr.fetch()
-        if gr.response is None:
+        response = gr.fetch()
+        if response is None:
             self.allow_all = True
             return
-        if not gr.response.status.startswith("2"):
+        if not response.status.startswith("2"):
             self.allow_all = True
         else:
-            self.parse(gr.response.content.splitlines())
+            self.parse(response.content.splitlines())
 
 
 class GeminiResource():
@@ -147,9 +147,7 @@ class GeminiResource():
         # NB: this intentionally does NOT fetch the normalized URL, because that could
         # cause an infinite loop with, e.g., normalization stripping a trailing slash
         # and a server redirecting to the same URL _with_ a trailing slash.
-        response = gusmobile.fetch(self.fully_qualified_url)
-        self.response = response
-        return self.response
+        return gusmobile.fetch(self.fully_qualified_url)
 
 
     def _get_normalized_url_and_host(self):
@@ -160,16 +158,14 @@ class GeminiResource():
         return url_normalized, host_normalized
 
 
-    def extract_contained_resources(self):
+    def extract_contained_resources(self, content):
         # this finds all gemini URLs within the content of a given GeminiResource and
-        # returns them as a list of new, unfetched GeminiResources
-        if not self.response:
-            pass
+        # returns them as a list of new GeminiResources
         if self.contained_resources:
             return self.contained_resources
         link_pattern = "^=>\s*(\S+)"
-        probable_urls = re.findall(link_pattern, self.response.content, re.MULTILINE)
+        probable_urls = re.findall(link_pattern, content, re.MULTILINE)
         resources = []
         for url in probable_urls:
             resource = GeminiResource(