geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git

commit c562254a65a505dc9f5e5c54cb8ee6be810c644a
parent 6ea586ae2380fcdedf5c83d5ee99438370390cad
Author: René Wagner <rwa@clttr.info>
Date:   Fri,  6 Jan 2023 11:55:41 +0100

fix crawl, revamp crawl & indexing procedure
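Note on the "fix crawl" half (see the gus/crawl.py hunk below): the replaced peewee filter had what looks like a typo (`Page.ur` for `Page.url`), referenced a bare `last_crawl_at` name, and joined its two conditions with `&` where the replacement raw SQL uses `OR`. A related pitfall when writing such filters is that Python's `&` binds tighter than `<`, so each comparison in a peewee expression needs its own parentheses. A standard-library illustration of that precedence (not project code):

import ast

# '&' binds tighter than '<', so "a < b & c" parses as "a < (b & c)":
# the BitAnd node ends up nested inside the comparison, not around it.
print(ast.dump(ast.parse("a < b & c", mode="eval").body))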

Diffstat:
M README.md                    |  4 +---
M gus/crawl.py                 |  7 ++++++-
M infra/filter_seed_request.sh |  6 +++---
D infra/gus-crawl.service      | 13 -------------
D infra/gus-index.service      | 13 -------------
A infra/update_index.sh        |  6 ++++++
6 files changed, 16 insertions(+), 33 deletions(-)

diff --git a/README.md b/README.md
@@ -38,9 +38,7 @@ Now you'll have created `index.new` directory, rename it to `index`.
 
 ### Running the crawl & indexer in production with systemd
 
-1. update `infra/gus-crawl.service` & `infra/gus-index.service` to match your needs (directory, user)
-2. copy both files to `/etc/systemd/system/`
-3. set up a cron job for root with the following params: `0 9 */3 * * systemctl start gus-crawl --no-block`
+3. set up a cron job with the following params: `0 9 * * * /home/gus/infra/update-index.sh /home/gus`
 
 ## Running the test suite
 
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -514,8 +514,13 @@ def crawl_page(
 
 def load_expired_urls():
-    expired_pages = Page.select(Page.ur).where(last_crawl_at < (datetime.now() + timedelta(hours=(Page.change_frequency * -1))) & Page.last_crawl_at.is_null(True))
+    expired_pages = Page.raw(
+        """SELECT p.url
+FROM page as p
+WHERE datetime(last_crawl_at, REPLACE('fnord hours', 'fnord', change_frequency)) < datetime('now') OR last_crawl_at IS NULL""" )
     return [page.url for page in expired_pages.execute()]
+#    expired_pages = Page.select(Page.url).where(Page.last_crawl_at < (datetime.now() - timedelta(hours=Page.change_frequency)) & Page.last_crawl_at.is_null(True))
+#    return expired_pages
 
 
 def load_seed_request_urls():
     with open("seed-requests.txt") as f:
diff --git a/infra/filter_seed_request.sh b/infra/filter_seed_request.sh
@@ -1,3 +1,3 @@
-cd /home/gus/
-cat seed-requests.txt | grep -iE '^gemini:\/\/[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,10}\b[-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*$' | sort | uniq > seed-requests.txt2
-mv seed-requests.txt2 seed-requests.txt
+BASEDIR=${1}
+cat ${BASEDIR}/seed-requests.txt | grep -iE '^gemini:\/\/[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,10}\b[-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*$' | sort | uniq > ${BASEDIR}/seed-requests.txt2
+mv ${BASEDIR}/seed-requests.txt2 ${BASEDIR}/seed-requests.txt
diff --git a/infra/gus-crawl.service b/infra/gus-crawl.service
@@ -1,13 +0,0 @@
-# /etc/systemd/system/gus.service
-
-[Unit]
-Description=Gemini Universal Search - Crawler
-
-[Service]
-User=gus
-Group=gus
-Type=oneshot
-WorkingDirectory=/home/gus/
-Environment="PYTHONUNBUFFERED=1"
-ExecStart=/home/gus/.poetry/bin/poetry run crawl
-ExecStopPost=sudo systemctl start gus-index --no-block
diff --git a/infra/gus-index.service b/infra/gus-index.service
@@ -1,13 +0,0 @@
-# /etc/systemd/system/gus.service
-
-[Unit]
-Description=Gemini Universal Search - Indexer
-
-[Service]
-User=gus
-Group=gus
-Type=oneshot
-WorkingDirectory=/home/gus
-Environment="PYTHONUNBUFFERED=1"
-ExecStart=/home/gus/.poetry/bin/poetry run build_index
-ExecStopPost=sudo systemctl restart gus
diff --git a/infra/update_index.sh b/infra/update_index.sh
@@ -0,0 +1,6 @@
+BASEDIR=${1}
+rm ${BASEDIR}/logs/gus.log
+${BASEDIR}/infra/filter_seed_request.sh ${BASEDIR}
+${BASEDIR}/.poetry/bin/poetry run crawl
+${BASEDIR}/.poetry/bin/poetry run build_index
+sudo systemctl restart gus
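The rewritten load_expired_urls() leans on a SQLite idiom: datetime() accepts textual modifiers such as '24 hours', and REPLACE('fnord hours', 'fnord', change_frequency) builds that modifier per row from the column value. A minimal runnable sketch of the same query with the standard sqlite3 module and a made-up two-row table (illustrative schema and URLs, not the real GUS ones):

import sqlite3

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE page (url TEXT, last_crawl_at TEXT, change_frequency INTEGER)")
con.executemany(
    "INSERT INTO page VALUES (?, ?, ?)",
    [
        ("gemini://stale.example/", "2020-01-01 00:00:00", 24),  # crawl long expired
        ("gemini://fresh.example/", None, 24),                   # never crawled at all
    ],
)

# REPLACE('fnord hours', 'fnord', change_frequency) yields '24 hours' here,
# a valid datetime() modifier, so each row gets its own expiry window;
# rows never crawled (NULL last_crawl_at) are caught by the OR branch.
expired = con.execute(
    """SELECT p.url
       FROM page AS p
       WHERE datetime(last_crawl_at, REPLACE('fnord hours', 'fnord', change_frequency)) < datetime('now')
          OR last_crawl_at IS NULL"""
).fetchall()
print([url for (url,) in expired])  # both URLs are due for a crawl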