geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

excludes.py (6934B)


      1 # These are checked against normalized_url, so they should be
      2 # prepended with the gemini:// protocol, be all lowercased, and
      3 # not have the port specified if it is 1965.
      4 EXCLUDED_URL_PREFIXES = [
      5     "gemini://localhost",
      6     "gemini://example.org",
      7     "gemini://example.com",
      8     "gemini://www.youtube.com/",
      9     # LEO generating useless URIs
     10     "gemini://tilde.team/~khuxkm/leo/",
     11     # all combinations of a tictactoe board
     12     "gemini://tictactoe.lanterne.chilliet.eu",
     13 
     14     "gemini://kennedy.gemi.dev/",
     15     "gemini://gemi.dev/cgi-bin/",
     16     "gemini://auragem.space/texts/jewish",
     17     "gemini://auragem.space/twitch/",
     18     # serving big files and slooow capsule -> takes to long to crawl
     19     "gemini://kamalatta.ddnss.de/",
     20     "gemini://tweek.zyxxyz.eu/valentina/",
     21 
     22     # ASCII art with emulated modem speed
     23     "gemini://ansi.hrtk.in/",
     24     "gemini://matrix.kiwifarms.net",
     25 
     26     # ZachDeCooks songs
     27     "gemini://songs.zachdecook.com/song.gmi.php/",
     28     "gemini://songs.zachdecook.com/chord.svg/",
     29     "gemini://gemini.zachdecook.com/cgi-bin/ccel.sh",
     30 
     31     # kwiecien gemcast
     32     "gemini://kwiecien.us/gemcast/",
     33 
     34     # breaks crawl due to recursion overflow
     35     "gemini://cadence.moe/chapo/",
     36 
     37     "gemini://nixo.xyz/reply/", 
     38     "gemini://nixo.xyz/notify",
     39     "gemini://gemini.thebackupbox.net/queryresponse",
     40     "gemini://gemini.thebackupbox.net/cgi-bin/",
     41     "gemini://gem.garichankar.com/share_audio", 
     42 
     43     # Mastodon mirrors
     44     "gemini://vps01.rdelaage.ovh/",
     45     "gemini://mastogem.picasoft.net/",
     46     "gemini://mastogem.remorse.us/",
     47 
     48     # various failing resources on runjimmyrunrunyoufuckerrun.com
     49     "gemini://runjimmyrunrunyoufuckerrun.com/fonts/",
     50     "gemini://runjimmyrunrunyoufuckerrun.com/tmp/",
     51 
     52     # Search providers 
     53     "gemini://houston.coder.town/search?",
     54     "gemini://houston.coder.town/search/",
     55 	"gemini://marginalia.nu/search",
     56     "gemini://geminispace.info",
     57     "gemini://tlgs.one/",
     58     "gemini://gus.guru/",
     59 
     60     # Geddit
     61     "gemini://geddit.pitr.ca/post?",
     62     "gemini://geddit.pitr.ca/c/",
     63     "gemini://geddit.glv.one/post?",
     64     "gemini://geddit.glv.one/c/",
     65 
     66     # Marmaladefoo calculator
     67     "gemini://gemini.marmaladefoo.com/cgi-bin/calc.cgi?",
     68     "gemini://gemini.circumlunar.space/users/fgaz/calculator/",
     69 
     70     # Individual weather pages
     71     "gemini://acidic.website/cgi-bin/weather.tcl?",
     72     "gemini://caolan.uk/weather/",
     73 
     74     # Alex Schroeder's problematic stuff
     75     "gemini://alexschroeder.ch/image_external",
     76     "gemini://alexschroeder.ch/html/",
     77     "gemini://alexschroeder.ch/diff/",
     78     "gemini://alexschroeder.ch/history/",
     79     "gemini://alexschroeder.ch/http",
     80     "gemini://alexschroeder.ch/https",
     81     "gemini://alexschroeder.ch/tag/",
     82     "gemini://alexschroeder.ch/raw/",
     83     "gemini://alexschroeder.ch/map/",
     84     "gemini://alexschroeder.ch/do/comment",
     85     "gemini://alexschroeder.ch/do/rc",
     86     "gemini://alexschroeder.ch/do/rss",
     87     "gemini://alexschroeder.ch/do/new",
     88     "gemini://alexschroeder.ch/do/more",
     89     "gemini://alexschroeder.ch/do/tags",
     90     "gemini://alexschroeder.ch/do/match",
     91     "gemini://alexschroeder.ch/do/search",
     92     "gemini://alexschroeder.ch/do/gallery/",
     93 
     94     # mozz mailing list linkscraper 
     95     "gemini://mozz.us/files/gemini-links.gmi",
     96     "gemini://gem.benscraft.info/mailing-list",
     97     # gemini.techrights.org
     98     "gemini://gemini.techrights.org/",
     99 
    100     # endless stream
    101     "gemini://202x.moe/resonance",
    102 
    103     # big file
    104     "gemini://mirrors.apple2.org.za/active/ftp.apple.asimov.net/",
    105 
    106     # hackernews mirror
    107     "gemini://gem.graypegg.com/hn/",
    108     # antenna filters
    109     "gemini://warmedal.se/~antenna/filter",
    110 
    111     # youtube mirror
    112     "gemini://auragem.space/cgi-bin/youtube.cgi?",
    113     "gemini://auragem.space/youtube/",
    114     
    115 	# news mirrors - not our business
    116     "gemini://teapot.styx.org",
    117     "gemini://taz.de/",
    118     "gemini://gemini.knusbaum.com/feeds",
    119     "gemini://guardian.shit.cx/",
    120     "gemini://simplynews.metalune.xyz",
    121     "gemini://illegaldrugs.net/cgi-bin/news.php",
    122     "gemini://illegaldrugs.net/cgi-bin/reader",
    123     "gemini://illegaldrugs.net:1965/cgi-bin/reader",
    124     "gemini://rawtext.club/~sloum/geminews",
    125     "gemini://gemini.cabestan.tk/hn",
    126     "gemini://hn.filiuspatris.net/",
    127     "gemini://schmittstefan.de/de/nachrichten/",
    128     "gemini://gmi.noulin.net/mobile",
    129     "gemini://jpfox.fr/rss/",
    130     "gemini://dw.schettler.net/",
    131 	"gemini://dioskouroi.xyz/top",
    132 	"gemini://drewdevault.com/cgi-bin/hn.py",
    133 	"gemini://tobykurien.com/maverick/",
    134     "gemini://news.manuceau.net/",
    135     "gemini://gemini-news.com/",
    136     "gemini://news.tuxmachines.org/",
    137     "gemini://musicdir.zachdecook.com/",
    138     "gemini://federal.cx/news",
    139     "gemini://kypan.me/cgi",
    140     
    141 	# wikipedia proxy
    142     "gemini://wp.pitr.ca/",
    143     "gemini://wp.glv.one/",
    144     "gemini://wikipedia.geminet.org/",
    145 	"gemini://wikipedia.geminet.org:1966",
    146     "gemini://vault.transjovian.org/",
    147     
    148     # client torture test
    149     "gemini://egsam.pitr.ca/",
    150     "gemini://egsam.glv.one/",
    151     "gemini://gemini.conman.org/test",
    152 
    153     # mozz's chat
    154     "gemini://chat.mozz.us/stream",
    155     "gemini://chat.mozz.us/submit",
    156 
    157     # gempod
    158     "gemini://rocketcaster.xyz/share/",
    159 
    160     # gopher proxy
    161     "gemini://80h.dev/agena/",
    162 
    163     # astrobotany
    164     "gemini://astrobotany.mozz.us/",
    165     "gemini://carboncopy.xyz/cgi-bin/apache.gex/",
    166 
    167     # infinite maze
    168     "gemini://alexey.shpakovsky.ru/maze",
    169 
    170     # susa.net
    171     "gemini://gemini.susa.net/cgi-bin/search?",
    172     "gemini://gemini.susa.net/cgi-bin/twitter?",
    173     "gemini://gemini.susa.net/cgi-bin/vim-search?",
    174     "gemini://gemini.susa.net/cgi-bin/links_stu.lua?",
    175 
    176     "gemini://gemini.spam.works/textfiles/",
    177     "gemini://gemini.spam.works/mirrors/textfiles/",
    178     "gemini://gemini.spam.works/users/dvn/archive/",
    179 
    180     # streams that never end...
    181     "gemini://gemini.thebackupbox.net/radio",
    182     "gemini://higeki.jp/radio",
    183 
    184     # full web proxy
    185     "gemini://webgate.geminet.org/",
    186     "gemini://drewdevault.com/cgi-bin/web.sh?",
    187 	"gemini://gemiprox.pollux.casa/",
    188     "gemini://gemiprox.pollux.casa:1966",
    189     "gemini://ecs.d2evs.net/proxy/",
    190 
    191 	# killing crawl, I think maybe because it's too big
    192 	# cryptocurrency bullshit
    193     "gemini://gem.denarii.cloud/",
    194 
    195     # docs - not our business
    196     "gemini://cfdocs.wetterberg.nu/",
    197     "gemini://godocs.io",
    198 
    199     "gemini://musicbrainz.uploadedlobster.com/",
    200 
    201     # git repos
    202     "gemini://git.skyjake.fi",
    203     "gemini://gemini.unlimited.pizza/git",
    204     "gemini://r.bdr.sh/git", 
    205     # games
    206     "gemini://jsreed5.org/live/",
    207     "gemini://gemini.thegonz.net/ski",
    208     "gemini://gemini.thegonz.net/gemski",
    209     "gemini://thegonz.net/",
    210     "gemini://gemlog.stargrave.org/",
    211     # NOULIN
    212     "gemini://gmi.noulin.net/stackoverflow/",
    213     "gemini://gmi.noulin.net/gitRepositories/",
    214     "gemini://gmi.noulin.net/man/",
    215 ]
    216 
    217 EXCLUDED_URL_PATHS = [
    218     "favicon.ico",
    219     "favicon.txt",
    220     "robots.txt",
    221     "rss.txt",
    222     "rss.xml",
    223 ]