commit ac4aa302ec3d2cc6af7f52682aa955fd0e2b7fb9
parent c10da9f7bfe5ef7395f1b679e91bf329073439ff
Author: Natalie Pendragon <natpen@natpen.net>
Date: Fri, 5 Jun 2020 07:35:12 -0400

Add easy CLI way of removing domains from index

Diffstat:
A | gus/remove_domain.py | | | 51 | +++++++++++++++++++++++++++++++++++++++++++++++++++ |
M | pyproject.toml | | | 1 | + |
2 files changed, 52 insertions(+), 0 deletions(-)
diff --git a/gus/remove_domain.py b/gus/remove_domain.py
@@ -0,0 +1,51 @@
+import sys
+
+from whoosh.qparser import QueryParser
+from whoosh.index import open_dir
+
+
+def main():
+    """Remove every indexed document of a domain, after confirmation.
+
+    The domain is read from the first command-line argument. The URLs of
+    the matching documents are listed, and they are deleted from the index
+    in the "index" directory only if the answer typed at the prompt starts
+    with "y" (case-insensitive).
+    """
+    if len(sys.argv) < 2:
+        print("Please specify a domain...")
+        return
+
+    ix = open_dir("index")
+    with ix.searcher() as searcher:
+        query_parser = QueryParser("domain", ix.schema)
+        query = query_parser.parse(sys.argv[1])
+        results = searcher.search(query, limit=None)
+
+        if len(results) == 0:
+            print("No documents found for domain.")
+            return
+
+        # confirm removal before proceeding
+        print("Documents facing removal")
+        print("------------------------")
+        for result in results:
+            print(result["url"])
+        answer = input("\nPlease confirm removal [y/n]:")
+        # An empty answer (just pressing Enter) must abort rather than
+        # raise IndexError, so test the prefix instead of indexing [0].
+        if not answer.lower().startswith("y"):
+            print("Aborting removal.")
+            return
+        # Collect the document numbers while the searcher is still open.
+        docnums = [result.docnum for result in results]
+
+    with ix.writer() as writer:
+        for docnum in docnums:
+            writer.delete_document(docnum)
+
+    print("{} documents removed from index.".format(len(docnums)))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
@@ -23,5 +23,6 @@ build-backend = "poetry.masonry.api"
 [tool.poetry.scripts]
 crawl = "gus.crawl:main"
 search_index = "gus.search_index:main"
+remove_domain = "gus.remove_domain:main"
 serve = "gus.serve:main"
 statistics = "gus.lib.index_statistics:run_index_statistics"