geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit ac4aa302ec3d2cc6af7f52682aa955fd0e2b7fb9
parent c10da9f7bfe5ef7395f1b679e91bf329073439ff
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Fri,  5 Jun 2020 07:35:12 -0400

Add easy CLI way of removing domains from index

Diffstat:
A gus/remove_domain.py | 40 ++++++++++++++++++++++++++++++++++++++++
M pyproject.toml       |  1 +
2 files changed, 41 insertions(+), 0 deletions(-)

diff --git a/gus/remove_domain.py b/gus/remove_domain.py
@@ -0,0 +1,40 @@
+import sys
+
+from whoosh.qparser import QueryParser
+from whoosh.index import open_dir
+
+def main():
+    if len(sys.argv) < 2:
+        print("Please specify a domain...")
+        return
+
+    ix = open_dir("index")
+    with ix.searcher() as searcher:
+        query_parser = QueryParser("domain", ix.schema)
+        query = query_parser.parse(sys.argv[1])
+        results = searcher.search(query, limit=None)
+
+        if len(results) == 0:
+            print("No documents found for domain.")
+            return
+
+        # confirm removal before proceeding
+        print("Documents facing removal")
+        print("------------------------")
+        for result in results:
+            print(result["url"])
+        answer = input("\nPlease confirm removal [y/n]:")
+        if answer.lower()[0] != "y":
+            print("Aborting removal.")
+            return
+        docnums = [result.docnum for result in results]
+
+    with ix.writer() as writer:
+        for docnum in docnums:
+            writer.delete_document(docnum)
+
+    print("{} documents removed from index.".format(len(results)))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
@@ -23,5 +23,6 @@ build-backend = "poetry.masonry.api"
 [tool.poetry.scripts]
 crawl = "gus.crawl:main"
 search_index = "gus.search_index:main"
+remove_domain = "gus.remove_domain:main"
 serve = "gus.serve:main"
 statistics = "gus.lib.index_statistics:run_index_statistics"