commit e779a45a3564b2fa475280b2c51eec33cb7b2e84
parent 50be14a327be8ab79b9b61bef1768935a6485357
Author: Solderpunk <solderpunk@sdf.org>
Date: Sat, 22 Jun 2019 16:29:14 +0300
Use text encoding declared in the response header, and treat decoding errors as fatal.
Diffstat:
M | av98.py | | | 66 | ++++++++++++++---------------------------------------------------- |
1 file changed, 14 insertions(+), 52 deletions(-)
diff --git a/av98.py b/av98.py
@@ -10,6 +10,7 @@
import argparse
import cmd
+import cgi
import codecs
import collections
import fnmatch
@@ -27,13 +28,6 @@ import urllib.parse
import ssl
import time
-# Use chardet if it's there, but don't depend on it
-try:
- import chardet
- _HAS_CHARDET = True
-except ImportError:
- _HAS_CHARDET = False
-
# Command abbreviations
_ABBREVS = {
"a": "add",
@@ -237,13 +231,13 @@ class GeminiClient(cmd.Cmd):
self._debug("Response header: %s." % header)
body = f.read()
status, mime = header.split("\t")
-
-# except UnicodeError:
-# print("""ERROR: Unknown text encoding!
-#If you know the correct encoding, use e.g. 'set encoding koi8-r' and
-#try again. Otherwise, install the 'chardet' library for Python 3 to
-#enable automatic encoding detection.""")
-# return
+ mime, mime_options = cgi.parse_header(mime)
+ if "charset" in mime_options:
+ try:
+ codecs.lookup(mime_options["charset"])
+ except LookupError:
+ print("Header declared unknown encoding %s" % mime_options["charset"])
+ return
# Catch network errors which may be recoverable if a redundant
# mirror is specified
@@ -294,8 +288,12 @@ Slow internet connection? Use 'set timeout' to be more patient.""")
## Set file mode
if mime.startswith("text/"):
mode = "w"
- encoding = "UTF-8"
- body = body.decode(encoding)
+ encoding = mime_options.get("charset", "UTF-8")
+ try:
+ body = body.decode(encoding)
+ except UnicodeError:
+ print("Could not decode response body using %s encoding declared in header!" % encoding)
+ return
else:
mode = "wb"
encoding = None
@@ -390,42 +388,6 @@ Slow internet connection? Use 'set timeout' to be more patient.""")
self._debug("Using handler: %s" % cmd_str)
return cmd_str
- def _decode_text(self, f):
- # Attempt to decode some bytes into a Unicode string.
- # First of all, try UTF-8 as the default.
- # If this fails, attempt to autodetect the encoding if chardet
- # library is installed.
- # If chardet is not installed, or fails to work, fall back on
- # the user-specified alternate encoding.
- # If none of this works, this will raise UnicodeError and it's
- # up to the caller to handle it gracefully.
- raw_bytes = f.read()
- # Try UTF-8 first:
- try:
- text = raw_bytes.decode("UTF-8")
- except UnicodeError:
- # If we have chardet, try the magic
- self._debug("Could not decode response as UTF-8.")
- if _HAS_CHARDET:
- autodetect = chardet.detect(raw_bytes)
- # Make sure we're vaguely certain
- if autodetect["confidence"] > 0.5:
- self._debug("Trying encoding %s as recommended by chardet." % autodetect["encoding"])
- text = raw_bytes.decode(autodetect["encoding"])
- else:
- # Try the user-specified encoding
- self._debug("Trying fallback encoding %s." % self.options["encoding"])
- text = raw_bytes.decode(self.options["encoding"])
- else:
- # Try the user-specified encoding
- text = raw_bytes.decode(self.options["encoding"])
- if not text.endswith("\n"):
- text += CRLF
- new_f = io.StringIO()
- new_f.write(text)
- new_f.seek(0)
- return new_f
-
def _handle_index(self, body, menu_gi):
self.index = []
if self.idx_filename: