geminispace.info

gemini search engine
git clone https://git.clttr.info/geminispace.info.git
Log (Feed) | Files | Refs (Tags) | README | LICENSE

commit 8994b21fea2d7adf1ababecfae27971ff8390fb1
parent 97b15eaa87fcd5ba7604fd540e64767fdfe6b04c
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Tue, 19 May 2020 17:08:53 -0400

[crawl] Fix lots of bugs

Diffstat:
Mgus/crawl.py | 78++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
Agus/lib/domain.py | 6++++++
Mgus/lib/gemini.py | 46++++++++++++++++++++++++----------------------
Mpoetry.lock | 97+++++++++++++++++++++++++++++++++++++++----------------------------------------
4 files changed, 124 insertions(+), 103 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py @@ -26,8 +26,9 @@ from gus.lib.gemini import GeminiResource uses_relative.append("gemini") uses_netloc.append("gemini") -INDEX_DIR = "index" -INDEX_STORAGE = FileStorage(INDEX_DIR) +INDEX_DIR_CURRENT = "index" +INDEX_DIR_BACKUP = INDEX_DIR_CURRENT + ".bak" +INDEX_DIR_NEW = INDEX_DIR_CURRENT + ".new" SEED_URLS = [ # English @@ -93,6 +94,12 @@ EXCLUDED_URL_PREFIXES = [ "gemini://gus.guru/add-seed?", ] +EXCLUDED_URL_PATHS = [ + "/atom.xml", + "/robots.txt", + "/rss.txt", +] + def backup_old_index(index_dir, backup_dir): last_index_modification_time = datetime.fromtimestamp(os.path.getmtime(index_dir)) @@ -104,8 +111,7 @@ def backup_old_index(index_dir, backup_dir): def create_index(index_dir): - backup_old_index(index_dir, index_dir + ".bak") - shutil.rmtree(index_dir) + shutil.rmtree(index_dir, ignore_errors=True) pathlib.Path(index_dir).mkdir(parents=True, exist_ok=True) schema = Schema( url=TEXT( @@ -128,12 +134,12 @@ def create_index(index_dir): stored=True ), ) - INDEX_STORAGE.create_index(schema) + index_storage.create_index(schema) def index_binary(response): print("INDEXING BINARY...") - index_writer = INDEX_STORAGE.open_index().writer() + index_writer = index_storage.open_index().writer() try: index_writer.add_document( url=response.url, @@ -147,7 +153,7 @@ def index_binary(response): def index_prompt(response): print("INDEXING PROMPT...") - index_writer = INDEX_STORAGE.open_index().writer() + index_writer = index_storage.open_index().writer() try: index_writer.add_document( url=response.url, @@ -162,7 +168,7 @@ def index_prompt(response): def index_content(response): print("INDEXING CONTENT...") - index_writer = INDEX_STORAGE.open_index().writer() + index_writer = index_storage.open_index().writer() try: index_writer.add_document( url=response.url, @@ -197,8 +203,13 @@ def crawl(gemini_resource): return gr = gemini_resource - for url_prefix in EXCLUDED_URL_PREFIXES: - if gr.normalized_url.startswith(url_prefix): + for excluded_prefix in EXCLUDED_URL_PREFIXES: + if gr.normalized_url.startswith(excluded_prefix): + print("MANUAL EXCLUSION SKIP : %s" % gr.fully_qualified_url) + print("--------------------------") + return + for excluded_path in EXCLUDED_URL_PATHS: + if gr.urlsplit.path.lower() == excluded_path: print("MANUAL EXCLUSION SKIP : %s" % gr.fully_qualified_url) print("--------------------------") return @@ -233,6 +244,9 @@ def crawl(gemini_resource): domain_hit_timings[gr.normalized_host] = datetime.now() # Actually fetch! + print("Fetching {}".format(gr.fully_qualified_url)) + if gr.fully_qualified_parent_url is not None: + print("With parent {}".format(gr.fully_qualified_parent_url)) r = gr.fetch() if r is None: @@ -243,15 +257,10 @@ def crawl(gemini_resource): elif r.status.startswith("3"): # redirect status print("REDIRECT : %s -> %s" % (gr.fully_qualified_url, r.url)) - # NB: this pop is necessary because if the redirect is a change to the URL - # structure of, essentially, the same URL (e.g., like the addition or removal - # of a trailing slash), then the crawl of the redirect would think it had - # already seen this resource in visited_urls' normalized source of truth. - visited_urls.pop() crawl_statistics["redirect_count"] += 1 # if is_nontrivial_redirect(gr.fully_qualified_url, r.url): # crawl_statistics["redirect_nontrivial_count"] += 1 - redirect_resource = GeminiResource(r.url) + redirect_resource = GeminiResource(r.url, gr.normalized_url, gr.normalized_host) crawl(redirect_resource) elif r.status.startswith("1"): # input status @@ -284,12 +293,13 @@ def is_nontrivial_redirect(url, redirect_url): return url.rstrip() != redirect_url.rstrip() -def load_visited_urls(): - return pickle.load( open( "visited_urls.p", "rb" ) ) - - -def persist_visited_urls(visited_urls): - pickle.dump( visited_urls, open( "visited_urls.p", "wb" ) ) +def load_visited_urls(index_dir): + visited_urls = [] + ix = open_dir(index_dir) + with ix.reader() as reader: + all_stored_fields = reader.all_stored_fields() + visited_urls = [GeminiResource(f["url"]).normalized_url for f in all_stored_fields] + return visited_urls def load_seed_request_urls(): @@ -302,15 +312,17 @@ def load_seed_request_urls(): def run_crawl(should_run_destructive=False, seed_urls=[]): # TODO: track failed domain/page attempts, and don't reattempt for 15mins + + global index_dir + index_dir = INDEX_DIR_NEW if should_run_destructive else INDEX_DIR_CURRENT + global index_storage + index_storage = FileStorage(index_dir) if should_run_destructive: - # TODO: backup previous pickle instead of deleting (should be storing - # all crawl state together somewhere) - if Path("visited_urls.p").is_file(): - os.remove("visited_urls.p") - create_index(INDEX_DIR) + # backup_old_index(INDEX_DIR_CURRENT, INDEX_DIR_BACKUP) + create_index(index_dir) global visited_urls - visited_urls = [] if should_run_destructive else load_visited_urls() + visited_urls = [] if should_run_destructive else load_visited_urls(INDEX_DIR_CURRENT) global robot_file_map robot_file_map = {} global domain_hit_timings @@ -335,11 +347,13 @@ def run_crawl(should_run_destructive=False, seed_urls=[]): for resource in seed_request_resources: crawl(resource) - persist_visited_urls(visited_urls) - - index_statistics = compute_index_statistics("index") + index_statistics = compute_index_statistics(index_dir) print_index_statistics(index_statistics, crawl_statistics) - persist_statistics(index_statistics, crawl_statistics, "statistics.csv") + if should_run_destructive: + persist_statistics(index_statistics, crawl_statistics, "statistics.csv") + # replace current index with new index + shutil.rmtree(INDEX_DIR_CURRENT, ignore_errors=True) + shutil.move(INDEX_DIR_NEW, INDEX_DIR_CURRENT) def main(): diff --git a/gus/lib/domain.py b/gus/lib/domain.py @@ -0,0 +1,6 @@ +import re + +def is_domain(possible_domain): + domain_pattern = "^((?=[a-z0-9-]{1,63}\.)(xn--)?[a-z0-9]+(-[a-z0-9]+)*\.)+(aaa|aarp|abarth|abb|abbott|abbvie|abc|able|abogado|abudhabi|ac|academy|accenture|accountant|accountants|aco|active|ac|or|ad|adac|ads|adult|ae|aeg|aero|aetna|af|afamilycompany|afl|africa|ag|agakhan|agency|ai|aig|aigo|airbus|airforce|airtel|akdn|al|alfaromeo|alibaba|alipay|allfinanz|allstate|ally|alsace|alstom|am|americanexpress|americanfamily|amex|amfam|amica|amsterdam|an|analytics|android|anquan|anz|ao|aol|apartments|app|apple|aq|aquarelle|ar|arab|aramco|archi|army|arpa|art|arte|as|asda|asia|associates|at|athleta|attorney|au|auction|audi|audible|audio|auspost|author|auto|autos|avianca|aw|aws|ax|axa|az|azure|ba|baby|baidu|banamex|bananarepublic|band|bank|bar|barcelona|barclaycard|barclays|barefoot|bargains|baseball|basketball|bauhaus|bayern|bb|bbc|bbt|bbva|bcg|bcn|bd|be|beats|beauty|beer|bentley|berlin|best|bestbuy|bet|bf|bg|bh|bharti|bi|bible|bid|bike|bing|bingo|bio|biz|bj|bl|black|blackfriday|blanco|blockbuster|blog|bloomberg|blue|bm|bms|bmw|bn|bnl|bnpparibas|bo|boats|boehringer|bofa|bom|bond|boo|book|booking|boots|bosch|bostik|boston|bot|boutique|box|bq|br|bradesco|bridgestone|broadway|broker|brother|brussels|bs|bt|budapest|bugatti|build|builders|business|buy|buzz|bv|bw|by|bz|bzh|ca|cab|cafe|cal|call|calvinklein|cam|camera|camp|cancerresearch|canon|capetown|capital|capitalone|car|caravan|cards|care|career|careers|cars|cartier|casa|case|caseih|cash|casino|cat|catering|catholic|cba|cbn|cbre|cbs|cc|cd|ceb|center|ceo|cern|cf|cfa|cfd|cg|ch|chanel|channel|charity|chase|chat|cheap|chintai|chloe|christmas|chrome|chrysler|church|ci|cipriani|circle|cisco|citadel|citi|citic|city|cityeats|ck|cl|claims|cleaning|click|clinic|clinique|clothing|cloud|club|clubmed|cm|cn|co|coach|codes|coffee|college|cologne|com|comcast|commbank|community|company|compare|computer|comsec|condos|construction|consulting|contact|contractors|cooking|cookingchannel|cool|coop|corsica|country|coupon|coupons|courses|cr|credit|creditcard|creditunion|cricket|crown|crs|cruise|cruises|csc|cu|cuisinella|cv|cw|cx|cy|cymru|cyou|cz|dabur|dad|dance|data|date|dating|datsun|day|dclk|dds|de|deal|dealer|deals|degree|delivery|dell|deloitte|delta|democrat|dental|dentist|desi|design|dev|dhl|diamonds|diet|digital|direct|directory|discount|discover|dish|diy|dj|dk|dm|dnp|do|docs|doctor|dodge|dog|doha|domains|doosan|dot|download|drive|dtv|dubai|duck|dunlop|duns|dupont|durban|dvag|dvr|dz|earth|eat|ec|eco|edeka|edu|education|ee|eg|eh|email|emerck|energy|engineer|engineering|enterprises|epost|epson|equipment|er|ericsson|erni|es|esq|estate|esurance|et|etisalat|eu|eurovision|eus|events|everbank|exchange|expert|exposed|express|extraspace|fage|fail|fairwinds|faith|family|fan|fans|farm|farmers|fashion|fast|fedex|feedback|ferrari|ferrero|fi|fiat|fidelity|fido|film|final|finance|financial|fire|firestone|firmdale|fish|fishing|fit|fitness|fj|fk|flickr|flights|flir|florist|flowers|flsmidth|fly|fm|fo|foo|food|foodnetwork|football|ford|forex|forsale|forum|foundation|fox|fr|free|fresenius|frl|frogans|frontdoor|frontier|ftr|fujitsu|fujixerox|fun|fund|furniture|futbol|fyi|ga|gal|gallery|gallo|gallup|game|games|gap|garden|gb|gbiz|gd|gdn|ge|gea|gent|genting|george|gf|gg|ggee|gh|gi|gift|gifts|gives|giving|gl|glade|glass|gle|global|globo|gm|gmail|gmbh|gmo|gmx|gn|godaddy|gold|goldpoint|golf|goo|goodhands|goodyear|goog|google|gop|got|gov|gp|gq|gr|grainger|graphics|gratis|green|gripe|grocery|group|gs|gt|gu|guardian|gucci|guge|guide|guitars|guru|gw|gy|hair|hamburg|hangout|haus|hbo|hdfc|hdfcbank|health|healthcare|help|helsinki|here|hermes|hgtv|hiphop|hisamitsu|hitachi|hiv|hk|hkt|hm|hn|hockey|holdings|holiday|homedepot|homegoods|homes|homesense|honda|honeywell|horse|hospital|host|hosting|hot|hoteles|hotels|hotmail|house|how|hr|hsbc|ht|htc|hu|hughes|hyatt|hyundai|ibm|icbc|ice|icu|id|ie|ieee|ifm|iinet|ikano|il|im|imamat|imdb|immo|immobilien|in|industries|infiniti|info|ing|ink|institute|insurance|insure|int|intel|international|intuit|investments|io|ipiranga|iq|ir|irish|is|iselect|ismaili|ist|istanbul|it|itau|itv|iveco|iwc|jaguar|java|jcb|jcp|je|jeep|jetzt|jewelry|jio|jlc|jll|jm|jmp|jnj|jo|jobs|joburg|jot|joy|jp|jpmorgan|jprs|juegos|juniper|kaufen|kddi|ke|kerryhotels|kerrylogistics|kerryproperties|kfh|kg|kh|ki|kia|kim|kinder|kindle|kitchen|kiwi|km|kn|koeln|komatsu|kosher|kp|kpmg|kpn|kr|krd|kred|kuokgroup|kw|ky|kyoto|kz|la|lacaixa|ladbrokes|lamborghini|lamer|lancaster|lancia|lancome|land|landrover|lanxess|lasalle|lat|latino|latrobe|law|lawyer|lb|lc|lds|lease|leclerc|lefrak|legal|lego|lexus|lgbt|li|liaison|lidl|life|lifeinsurance|lifestyle|lighting|like|lilly|limited|limo|lincoln|linde|link|lipsy|live|living|lixil|lk|llc|loan|loans|locker|locus|loft|lol|london|lotte|lotto|love|lpl|lplfinancial|lr|ls|lt|ltd|ltda|lu|lundbeck|lupin|luxe|luxury|lv|ly|ma|macys|madrid|maif|maison|makeup|man|management|mango|map|market|marketing|markets|marriott|marshalls|maserati|mattel|mba|mc|mcd|mcdonalds|mckinsey|md|me|med|media|meet|melbourne|meme|memorial|men|menu|meo|merckmsd|metlife|mf|mg|mh|miami|microsoft|mil|mini|mint|mit|mitsubishi|mk|ml|mlb|mls|mm|mma|mn|mo|mobi|mobile|mobily|moda|moe|moi|mom|monash|money|monster|montblanc|mopar|mormon|mortgage|moscow|moto|motorcycles|mov|movie|movistar|mp|mq|mr|ms|msd|mt|mtn|mtpc|mtr|mu|museum|mutual|mutuelle|mv|mw|mx|my|mz|na|nab|nadex|nagoya|name|nationwide|natura|navy|nba|nc|ne|nec|net|netbank|netflix|network|neustar|new|newholland|news|next|nextdirect|nexus|nf|nfl|ng|ngo|nhk|ni|nico|nike|nikon|ninja|nissan|nissay|nl|no|nokia|northwesternmutual|norton|now|nowruz|nowtv|np|nr|nra|nrw|ntt|nu|nyc|nz|obi|observer|off|office|okinawa|olayan|olayangroup|oldnavy|ollo|om|omega|one|ong|onl|online|onyourside|ooo|open|oracle|orange|org|organic|orientexpress|origins|osaka|otsuka|ott|ovh|pa|page|pamperedchef|panasonic|panerai|paris|pars|partners|parts|party|passagens|pay|pccw|pe|pet|pf|pfizer|pg|ph|pharmacy|phd|philips|phone|photo|photography|photos|physio|piaget|pics|pictet|pictures|pid|pin|ping|pink|pioneer|pizza|pk|pl|place|play|playstation|plumbing|plus|pm|pn|pnc|pohl|poker|politie|porn|post|pr|pramerica|praxi|press|prime|pro|prod|productions|prof|progressive|promo|properties|property|protection|pru|prudential|ps|pt|pub|pw|pwc|py|qa|qpon|quebec|quest|qvc|racing|radio|raid|re|read|realestate|realtor|realty|recipes|red|redstone|redumbrella|rehab|reise|reisen|reit|reliance|ren|rent|rentals|repair|report|republican|rest|restaurant|review|reviews|rexroth|rich|richardli|ricoh|rightathome|ril|rio|rip|rmit|ro|rocher|rocks|rodeo|rogers|room|rs|rsvp|ru|rugby|ruhr|run|rw|rwe|ryukyu|sa|saarland|safe|safety|sakura|sale|salon|samsclub|samsung|sandvik|sandvikcoromant|sanofi|sap|sapo|sarl|sas|save|saxo|sb|sbi|sbs|sc|sca|scb|schaeffler|schmidt|scholarships|school|schule|schwarz|science|scjohnson|scor|scot|sd|se|search|seat|secure|security|seek|select|sener|services|ses|seven|sew|sex|sexy|sfr|sg|sh|shangrila|sharp|shaw|shell|shia|shiksha|shoes|shop|shopping|shouji|show|showtime|shriram|si|silk|sina|singles|site|sj|sk|ski|skin|sky|skype|sl|sling|sm|smart|smile|sn|sncf|so|soccer|social|softbank|software|sohu|solar|solutions|song|sony|soy|space|spiegel|sport|spot|spreadbetting|sr|srl|srt|ss|st|stada|staples|star|starhub|statebank|statefarm|statoil|stc|stcgroup|stockholm|storage|store|stream|studio|study|style|su|sucks|supplies|supply|support|surf|surgery|suzuki|sv|swatch|swiftcover|swiss|sx|sy|sydney|symantec|systems|sz|tab|taipei|talk|taobao|target|tatamotors|tatar|tattoo|tax|taxi|tc|tci|td|tdk|team|tech|technology|tel|telecity|telefonica|temasek|tennis|teva|tf|tg|th|thd|theater|theatre|tiaa|tickets|tienda|tiffany|tips|tires|tirol|tj|tjmaxx|tjx|tk|tkmaxx|tl|tm|tmall|tn|to|today|tokyo|tools|top|toray|toshiba|total|tours|town|toyota|toys|tp|tr|trade|trading|training|travel|travelchannel|travelers|travelersinsurance|trust|trv|tt|tube|tui|tunes|tushu|tv|tvs|tw|tz|ua|ubank|ubs|uconnect|ug|uk|um|unicom|university|uno|uol|ups|us|uy|uz|va|vacations|vana|vanguard|vc|ve|vegas|ventures|verisign|versicherung|vet|vg|vi|viajes|video|vig|viking|villas|vin|vip|virgin|visa|vision|vista|vistaprint|viva|vivo|vlaanderen|vn|vodka|volkswagen|volvo|vote|voting|voto|voyage|vu|vuelos|wales|walmart|walter|wang|wanggou|warman|watch|watches|weather|weatherchannel|webcam|weber|website|wed|wedding|weibo|weir|wf|whoswho|wien|wiki|williamhill|win|windows|wine|winners|wme|wolterskluwer|woodside|work|works|world|wow|ws|wtc|wtf|xbox|xerox|xfinity|xihuan|xin|测试|कॉम|परीक्षा|セール|佛山|ಭಾರತ|慈善|集团|在线|한국|ଭାରତ|大众汽车|点看|คอม|ভাৰত|ভারত|八卦|‏موقع‎|বাংলা|公益|公司|香格里拉|网站|移动|我爱你|москва|испытание|қаз|католик|онлайн|сайт|联通|срб|бг|бел|‏קום‎|时尚|微博|테스트|淡马锡|ファッション|орг|नेट|ストア|삼성|சிங்கப்பூர்|商标|商店|商城|дети|мкд|‏טעסט‎|ею|ポイント|新闻|工行|家電|‏كوم‎|中文网|中信|中国|中國|娱乐|谷歌|భారత్|ලංකා|電訊盈科|购物|測試|クラウド|ભારત|通販|भारतम्|भारत|भारोत|‏آزمایشی‎|பரிட்சை|网店|संगठन|餐厅|网络|ком|укр|香港|诺基亚|食品|δοκιμή|飞利浦|‏إختبار‎|台湾|台灣|手表|手机|мон|‏الجزائر‎|‏عمان‎|‏ارامكو‎|‏ایران‎|‏العليان‎|‏اتصالات‎|‏امارات‎|‏بازار‎|‏موريتانيا‎|‏پاکستان‎|‏الاردن‎|‏موبايلي‎|‏بارت‎|‏بھارت‎|‏المغرب‎|‏ابوظبي‎|‏السعودية‎|‏ڀارت‎|‏كاثوليك‎|‏سودان‎|‏همراه‎|‏عراق‎|‏مليسيا‎|澳門|닷컴|政府|‏شبكة‎|‏بيتك‎|‏عرب‎|გე|机构|组织机构|健康|ไทย|‏سورية‎|招聘|рус|рф|珠宝|‏تونس‎|大拿|みんな|グーグル|ελ|世界|書籍|ഭാരതം|ਭਾਰਤ|网址|닷넷|コム|天主教|游戏|vermögensberater|vermögensberatung|企业|信息|嘉里大酒店|嘉里|‏مصر‎|‏قطر‎|广东|இலங்கை|இந்தியா|հայ|新加坡|‏فلسطين‎|テスト|政务|xperia|xxx|xyz|yachts|yahoo|yamaxun|yandex|ye|yodobashi|yoga|yokohama|you|youtube|yt|yun|za|zappos|zara|zero|zip|zippo|zm|zone|zuerich|zw)$" + domain_match = re.match(domain_pattern, possible_domain, re.I) + return domain_match diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py @@ -3,19 +3,14 @@ from urllib.parse import unquote, urljoin, urlparse, urlsplit, urlunparse, urlun import gusmobile +from gus.lib.domain import is_domain + # hack(natpen): the built-in methods in urllib need to know the # Gemini protocol exists uses_relative.append("gemini") uses_netloc.append("gemini") - -def is_domain(possible_domain): - domain_pattern = "^((?=[a-z0-9-]{1,63}\.)(xn--)?[a-z0-9]+(-[a-z0-9]+)*\.)+[a-z]{2,63}$" - domain_match = re.match(domain_pattern, possible_domain, re.I) - return domain_match is not None - - -def urlsplit_featureful(url, parent_resource=None): +def urlsplit_featureful(url, fully_qualified_parent_url=None, parent_hostname=None): # the point of this relatively complex function is to allow for protocol-less, # double-slash-prepended-less URLs that still get treated as absolute (i.e., # non-relative) URLs and thus get their hosts parsed correctly by `urlsplit`. @@ -23,16 +18,15 @@ def urlsplit_featureful(url, parent_resource=None): # things behind the scenes. is_relative = False - url = url.strip().rstrip("/") u = urlsplit(url, 'gemini') if u.scheme != "gemini": return None, None if u.hostname is None: if url.startswith("/"): # process relative link - if parent_resource is None: - return None - joined = urljoin("gemini://{}".format(parent_resource.normalized_host), url) + if parent_hostname is None: + return None, None + joined = urljoin("gemini://{}".format(parent_hostname), url) u = urlsplit(joined, 'gemini') is_relative = True else: # url does not start with / @@ -46,20 +40,24 @@ def urlsplit_featureful(url, parent_resource=None): u = urlsplit(url, 'gemini') else: # process relative link - if parent_resource is None: - return None - joined = urljoin("gemini://{}".format(parent_resource.normalized_host), url) + if fully_qualified_parent_url is None: + return None, None + joined = urljoin(fully_qualified_parent_url, url) u = urlsplit(joined, 'gemini') is_relative = True return u, is_relative class GeminiResource(): - def __init__(self, url, parent_resource=None): + def __init__(self, url, fully_qualified_parent_url=None, parent_hostname=None): self.raw_url = url - self.parent_resource = parent_resource - self.urlsplit, self.is_relative = urlsplit_featureful(url, self.parent_resource) + self.urlsplit, self.is_relative = urlsplit_featureful( + url, + fully_qualified_parent_url=fully_qualified_parent_url, + parent_hostname=parent_hostname, + ) self.is_valid = self.urlsplit is not None + self.fully_qualified_parent_url = fully_qualified_parent_url self._normalized_url = None self._normalized_host = None self._fully_qualified_url = None @@ -87,7 +85,7 @@ class GeminiResource(): return None if self._fully_qualified_url is None: if self.is_relative: - url = self.normalized_url + url = urlunsplit(self.urlsplit) else: raw_url_lower = self.raw_url.lower() if raw_url_lower.startswith("gemini://"): @@ -114,11 +112,11 @@ class GeminiResource(): def _get_normalized_url_and_host(self): - url_normalized = urlunsplit(self.urlsplit) + url_normalized = self.fully_qualified_url.lower() if "%" in url_normalized: url_normalized = unquote(url_normalized) if self.urlsplit.port == 1965: - url_normalized = url_normalized.replace(u.hostname+":1965", u.hostname, 1) + url_normalized = url_normalized.replace(self.urlsplit.hostname.lower() + ":1965", self.urlsplit.hostname.lower(), 1) host_normalized = self.urlsplit.hostname.lower() return url_normalized, host_normalized @@ -135,7 +133,11 @@ class GeminiResource(): probable_urls = re.findall(link_pattern, self.response.content, re.MULTILINE) resources = [] for url in probable_urls: - resource = GeminiResource(url, parent_resource=self) + resource = GeminiResource( + url, + fully_qualified_parent_url=self.fully_qualified_url, + parent_hostname=self.urlsplit.hostname, + ) if resource.is_valid: resources.append(resource) self.contained_resources = resources diff --git a/poetry.lock b/poetry.lock @@ -4,7 +4,7 @@ description = "A small Python module for determining appropriate platform-specif name = "appdirs" optional = false python-versions = "*" -version = "1.4.3" +version = "1.4.4" [[package]] category = "dev" @@ -22,7 +22,7 @@ marker = "sys_platform == \"win32\"" name = "atomicwrites" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "1.3.0" +version = "1.4.0" [[package]] category = "dev" @@ -72,7 +72,7 @@ description = "Composable command line interface toolkit" name = "click" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" -version = "7.1.1" +version = "7.1.2" [[package]] category = "dev" @@ -100,7 +100,7 @@ python-versions = "*" version = "0.1.0" [package.source] -reference = "123c73b4e06c89781543dfcba55581d6a3931129" +reference = "2f7c902c5c9086a14e4d07b46a04f90c22520e17" type = "git" url = "https://git.sr.ht/~natpen/gusmobile" [[package]] @@ -125,7 +125,7 @@ description = "IPython: Productive Interactive Computing" name = "ipython" optional = false python-versions = ">=3.6" -version = "7.13.0" +version = "7.14.0" [package.dependencies] appnope = "*" @@ -141,7 +141,7 @@ setuptools = ">=18.5" traitlets = ">=4.2" [package.extras] -all = ["numpy (>=1.14)", "testpath", "notebook", "nose (>=0.10.1)", "nbconvert", "requests", "ipywidgets", "qtconsole", "ipyparallel", "Sphinx (>=1.3)", "pygments", "nbformat", "ipykernel"] +all = ["nose (>=0.10.1)", "Sphinx (>=1.3)", "testpath", "nbformat", "ipywidgets", "qtconsole", "numpy (>=1.14)", "notebook", "ipyparallel", "ipykernel", "pygments", "requests", "nbconvert"] doc = ["Sphinx (>=1.3)"] kernel = ["ipykernel"] nbconvert = ["nbconvert"] @@ -188,7 +188,7 @@ description = "More routines for operating on iterables, beyond itertools" name = "more-itertools" optional = false python-versions = ">=3.5" -version = "8.2.0" +version = "8.3.0" [[package]] category = "dev" @@ -196,7 +196,7 @@ description = "Core utilities for Python packages" name = "packaging" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "20.3" +version = "20.4" [package.dependencies] pyparsing = ">=2.0.2" @@ -307,7 +307,7 @@ description = "pytest: simple powerful testing with Python" name = "pytest" optional = false python-versions = ">=3.5" -version = "5.4.1" +version = "5.4.2" [package.dependencies] atomicwrites = ">=1.0" @@ -333,7 +333,7 @@ description = "Alternative regular expression module, to replace re." name = "regex" optional = false python-versions = "*" -version = "2020.4.4" +version = "2020.5.14" [[package]] category = "dev" @@ -349,7 +349,7 @@ description = "Python Library for Tom's Obvious, Minimal Language" name = "toml" optional = false python-versions = "*" -version = "0.10.0" +version = "0.10.1" [[package]] category = "dev" @@ -410,16 +410,16 @@ python-versions = "^3.7" [metadata.files] appdirs = [ - {file = "appdirs-1.4.3-py2.py3-none-any.whl", hash = "sha256:d8b24664561d0d34ddfaec54636d502d7cea6e29c3eaf68f3df6180863e2166e"}, - {file = "appdirs-1.4.3.tar.gz", hash = "sha256:9e5896d1372858f8dd3344faf4e5014d21849c756c8d5701f78f8a103b372d92"}, + {file = "appdirs-1.4.4-py2.py3-none-any.whl", hash = "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128"}, + {file = "appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41"}, ] appnope = [ {file = "appnope-0.1.0-py2.py3-none-any.whl", hash = "sha256:5b26757dc6f79a3b7dc9fab95359328d5747fcb2409d331ea66d0272b90ab2a0"}, {file = "appnope-0.1.0.tar.gz", hash = "sha256:8b995ffe925347a2138d7ac0fe77155e4311a0ea6d6da4f5128fe4b3cbe5ed71"}, ] atomicwrites = [ - {file = "atomicwrites-1.3.0-py2.py3-none-any.whl", hash = "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4"}, - {file = "atomicwrites-1.3.0.tar.gz", hash = "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"}, + {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"}, + {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"}, ] attrs = [ {file = "attrs-19.3.0-py2.py3-none-any.whl", hash = "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c"}, @@ -434,8 +434,8 @@ black = [ {file = "black-19.10b0.tar.gz", hash = "sha256:c2edb73a08e9e0e6f65a0e6af18b059b8b1cdd5bef997d7a0b181df93dc81539"}, ] click = [ - {file = "click-7.1.1-py2.py3-none-any.whl", hash = "sha256:e345d143d80bf5ee7534056164e5e112ea5e22716bbb1ce727941f4c8b471b9a"}, - {file = "click-7.1.1.tar.gz", hash = "sha256:8a18b4ea89d8820c5d0c7da8a64b2c324b4dabb695804dbfea19b9be9d88c0cc"}, + {file = "click-7.1.2-py2.py3-none-any.whl", hash = "sha256:dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc"}, + {file = "click-7.1.2.tar.gz", hash = "sha256:d2b5255c7c6349bc1bd1e59e08cd12acbbd63ce649f2588755783aa94dfb6b1a"}, ] colorama = [ {file = "colorama-0.4.3-py2.py3-none-any.whl", hash = "sha256:7d73d2a99753107a36ac6b455ee49046802e59d9d076ef8e47b61499fa29afff"}, @@ -451,8 +451,8 @@ importlib-metadata = [ {file = "importlib_metadata-1.6.0.tar.gz", hash = "sha256:34513a8a0c4962bc66d35b359558fd8a5e10cd472d37aec5f66858addef32c1e"}, ] ipython = [ - {file = "ipython-7.13.0-py3-none-any.whl", hash = "sha256:eb8d075de37f678424527b5ef6ea23f7b80240ca031c2dd6de5879d687a65333"}, - {file = "ipython-7.13.0.tar.gz", hash = "sha256:ca478e52ae1f88da0102360e57e528b92f3ae4316aabac80a2cd7f7ab2efb48a"}, + {file = "ipython-7.14.0-py3-none-any.whl", hash = "sha256:5b241b84bbf0eb085d43ae9d46adf38a13b45929ca7774a740990c2c242534bb"}, + {file = "ipython-7.14.0.tar.gz", hash = "sha256:f0126781d0f959da852fb3089e170ed807388e986a8dd4e6ac44855845b0fb1c"}, ] ipython-genutils = [ {file = "ipython_genutils-0.2.0-py2.py3-none-any.whl", hash = "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8"}, @@ -467,12 +467,12 @@ jetforce = [ {file = "Jetforce-0.2.2.tar.gz", hash = "sha256:2600103ba35590b3884b97c764ba34e7da991ac39a0b03621da4ae0760627060"}, ] more-itertools = [ - {file = "more-itertools-8.2.0.tar.gz", hash = "sha256:b1ddb932186d8a6ac451e1d95844b382f55e12686d51ca0c68b6f61f2ab7a507"}, - {file = "more_itertools-8.2.0-py3-none-any.whl", hash = "sha256:5dd8bcf33e5f9513ffa06d5ad33d78f31e1931ac9a18f33d37e77a180d393a7c"}, + {file = "more-itertools-8.3.0.tar.gz", hash = "sha256:558bb897a2232f5e4f8e2399089e35aecb746e1f9191b6584a151647e89267be"}, + {file = "more_itertools-8.3.0-py3-none-any.whl", hash = "sha256:7818f596b1e87be009031c7653d01acc46ed422e6656b394b0f765ce66ed4982"}, ] packaging = [ - {file = "packaging-20.3-py2.py3-none-any.whl", hash = "sha256:82f77b9bee21c1bafbf35a84905d604d5d1223801d639cf3ed140bd651c08752"}, - {file = "packaging-20.3.tar.gz", hash = "sha256:3c292b474fda1671ec57d46d739d072bfd495a4f51ad01a055121d81e952b7a3"}, + {file = "packaging-20.4-py2.py3-none-any.whl", hash = "sha256:998416ba6962ae7fbd6596850b80e17859a5753ba17c32284f67bfff33784181"}, + {file = "packaging-20.4.tar.gz", hash = "sha256:4357f74f47b9c12db93624a82154e9b120fa8293699949152b22065d556079f8"}, ] parso = [ {file = "parso-0.7.0-py2.py3-none-any.whl", hash = "sha256:158c140fc04112dc45bca311633ae5033c2c2a7b732fa33d0955bad8152a8dd0"}, @@ -515,40 +515,39 @@ pyparsing = [ {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"}, ] pytest = [ - {file = "pytest-5.4.1-py3-none-any.whl", hash = "sha256:0e5b30f5cb04e887b91b1ee519fa3d89049595f428c1db76e73bd7f17b09b172"}, - {file = "pytest-5.4.1.tar.gz", hash = "sha256:84dde37075b8805f3d1f392cc47e38a0e59518fb46a431cfdaf7cf1ce805f970"}, + {file = "pytest-5.4.2-py3-none-any.whl", hash = "sha256:95c710d0a72d91c13fae35dce195633c929c3792f54125919847fdcdf7caa0d3"}, + {file = "pytest-5.4.2.tar.gz", hash = "sha256:eb2b5e935f6a019317e455b6da83dd8650ac9ffd2ee73a7b657a30873d67a698"}, ] regex = [ - {file = "regex-2020.4.4-cp27-cp27m-win32.whl", hash = "sha256:90742c6ff121a9c5b261b9b215cb476eea97df98ea82037ec8ac95d1be7a034f"}, - {file = "regex-2020.4.4-cp27-cp27m-win_amd64.whl", hash = "sha256:24f4f4062eb16c5bbfff6a22312e8eab92c2c99c51a02e39b4eae54ce8255cd1"}, - {file = "regex-2020.4.4-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:08119f707f0ebf2da60d2f24c2f39ca616277bb67ef6c92b72cbf90cbe3a556b"}, - {file = "regex-2020.4.4-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:c9423a150d3a4fc0f3f2aae897a59919acd293f4cb397429b120a5fcd96ea3db"}, - {file = "regex-2020.4.4-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:c087bff162158536387c53647411db09b6ee3f9603c334c90943e97b1052a156"}, - {file = "regex-2020.4.4-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:1cbe0fa0b7f673400eb29e9ef41d4f53638f65f9a2143854de6b1ce2899185c3"}, - {file = "regex-2020.4.4-cp36-cp36m-win32.whl", hash = "sha256:0ce9537396d8f556bcfc317c65b6a0705320701e5ce511f05fc04421ba05b8a8"}, - {file = "regex-2020.4.4-cp36-cp36m-win_amd64.whl", hash = "sha256:7e1037073b1b7053ee74c3c6c0ada80f3501ec29d5f46e42669378eae6d4405a"}, - {file = "regex-2020.4.4-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:4385f12aa289d79419fede43f979e372f527892ac44a541b5446617e4406c468"}, - {file = "regex-2020.4.4-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:a58dd45cb865be0ce1d5ecc4cfc85cd8c6867bea66733623e54bd95131f473b6"}, - {file = "regex-2020.4.4-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:ccccdd84912875e34c5ad2d06e1989d890d43af6c2242c6fcfa51556997af6cd"}, - {file = "regex-2020.4.4-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:ea4adf02d23b437684cd388d557bf76e3afa72f7fed5bbc013482cc00c816948"}, - {file = "regex-2020.4.4-cp37-cp37m-win32.whl", hash = "sha256:2294f8b70e058a2553cd009df003a20802ef75b3c629506be20687df0908177e"}, - {file = "regex-2020.4.4-cp37-cp37m-win_amd64.whl", hash = "sha256:e91ba11da11cf770f389e47c3f5c30473e6d85e06d7fd9dcba0017d2867aab4a"}, - {file = "regex-2020.4.4-cp38-cp38-manylinux1_i686.whl", hash = "sha256:5635cd1ed0a12b4c42cce18a8d2fb53ff13ff537f09de5fd791e97de27b6400e"}, - {file = "regex-2020.4.4-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:23069d9c07e115537f37270d1d5faea3e0bdded8279081c4d4d607a2ad393683"}, - {file = "regex-2020.4.4-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:c162a21e0da33eb3d31a3ac17a51db5e634fc347f650d271f0305d96601dc15b"}, - {file = "regex-2020.4.4-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:fb95debbd1a824b2c4376932f2216cc186912e389bdb0e27147778cf6acb3f89"}, - {file = "regex-2020.4.4-cp38-cp38-win32.whl", hash = "sha256:2a3bf8b48f8e37c3a40bb3f854bf0121c194e69a650b209628d951190b862de3"}, - {file = "regex-2020.4.4-cp38-cp38-win_amd64.whl", hash = "sha256:5bfed051dbff32fd8945eccca70f5e22b55e4148d2a8a45141a3b053d6455ae3"}, - {file = "regex-2020.4.4.tar.gz", hash = "sha256:295badf61a51add2d428a46b8580309c520d8b26e769868b922750cf3ce67142"}, + {file = "regex-2020.5.14-cp27-cp27m-win32.whl", hash = "sha256:e565569fc28e3ba3e475ec344d87ed3cd8ba2d575335359749298a0899fe122e"}, + {file = "regex-2020.5.14-cp27-cp27m-win_amd64.whl", hash = "sha256:d466967ac8e45244b9dfe302bbe5e3337f8dc4dec8d7d10f5e950d83b140d33a"}, + {file = "regex-2020.5.14-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:27ff7325b297fb6e5ebb70d10437592433601c423f5acf86e5bc1ee2919b9561"}, + {file = "regex-2020.5.14-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:ea55b80eb0d1c3f1d8d784264a6764f931e172480a2f1868f2536444c5f01e01"}, + {file = "regex-2020.5.14-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:c9bce6e006fbe771a02bda468ec40ffccbf954803b470a0345ad39c603402577"}, + {file = "regex-2020.5.14-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:d881c2e657c51d89f02ae4c21d9adbef76b8325fe4d5cf0e9ad62f850f3a98fd"}, + {file = "regex-2020.5.14-cp36-cp36m-win32.whl", hash = "sha256:99568f00f7bf820c620f01721485cad230f3fb28f57d8fbf4a7967ec2e446994"}, + {file = "regex-2020.5.14-cp36-cp36m-win_amd64.whl", hash = "sha256:70c14743320a68c5dac7fc5a0f685be63bc2024b062fe2aaccc4acc3d01b14a1"}, + {file = "regex-2020.5.14-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:a7c37f048ec3920783abab99f8f4036561a174f1314302ccfa4e9ad31cb00eb4"}, + {file = "regex-2020.5.14-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:89d76ce33d3266173f5be80bd4efcbd5196cafc34100fdab814f9b228dee0fa4"}, + {file = "regex-2020.5.14-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:51f17abbe973c7673a61863516bdc9c0ef467407a940f39501e786a07406699c"}, + {file = "regex-2020.5.14-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:ce5cc53aa9fbbf6712e92c7cf268274eaff30f6bd12a0754e8133d85a8fb0f5f"}, + {file = "regex-2020.5.14-cp37-cp37m-win32.whl", hash = "sha256:8044d1c085d49673aadb3d7dc20ef5cb5b030c7a4fa253a593dda2eab3059929"}, + {file = "regex-2020.5.14-cp37-cp37m-win_amd64.whl", hash = "sha256:c2062c7d470751b648f1cacc3f54460aebfc261285f14bc6da49c6943bd48bdd"}, + {file = "regex-2020.5.14-cp38-cp38-manylinux1_i686.whl", hash = "sha256:329ba35d711e3428db6b45a53b1b13a0a8ba07cbbcf10bbed291a7da45f106c3"}, + {file = "regex-2020.5.14-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:579ea215c81d18da550b62ff97ee187b99f1b135fd894a13451e00986a080cad"}, + {file = "regex-2020.5.14-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:3a9394197664e35566242686d84dfd264c07b20f93514e2e09d3c2b3ffdf78fe"}, + {file = "regex-2020.5.14-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:ce367d21f33e23a84fb83a641b3834dd7dd8e9318ad8ff677fbfae5915a239f7"}, + {file = "regex-2020.5.14-cp38-cp38-win32.whl", hash = "sha256:1386e75c9d1574f6aa2e4eb5355374c8e55f9aac97e224a8a5a6abded0f9c927"}, + {file = "regex-2020.5.14-cp38-cp38-win_amd64.whl", hash = "sha256:7e61be8a2900897803c293247ef87366d5df86bf701083b6c43119c7c6c99108"}, + {file = "regex-2020.5.14.tar.gz", hash = "sha256:ce450ffbfec93821ab1fea94779a8440e10cf63819be6e176eb1973a6017aff5"}, ] six = [ {file = "six-1.14.0-py2.py3-none-any.whl", hash = "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c"}, {file = "six-1.14.0.tar.gz", hash = "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a"}, ] toml = [ - {file = "toml-0.10.0-py2.7.egg", hash = "sha256:f1db651f9657708513243e61e6cc67d101a39bad662eaa9b5546f789338e07a3"}, - {file = "toml-0.10.0-py2.py3-none-any.whl", hash = "sha256:235682dd292d5899d361a811df37e04a8828a5b1da3115886b73cf81ebc9100e"}, - {file = "toml-0.10.0.tar.gz", hash = "sha256:229f81c57791a41d65e399fc06bf0848bab550a9dfd5ed66df18ce5f05e73d5c"}, + {file = "toml-0.10.1-py2.py3-none-any.whl", hash = "sha256:bda89d5935c2eac546d648028b9901107a595863cb36bae0c73ac804a9b4ce88"}, + {file = "toml-0.10.1.tar.gz", hash = "sha256:926b612be1e5ce0634a2ca03470f95169cf16f939018233a670519cb4ac58b0f"}, ] traitlets = [ {file = "traitlets-4.3.3-py2.py3-none-any.whl", hash = "sha256:70b4c6a1d9019d7b4f6846832288f86998aa3b9207c6821f3578a6a6a467fe44"},