From ba043ef4f2c713662f89425aed70dfd78e3955ee Mon Sep 17 00:00:00 2001 From: Patrick Welche Date: Mon, 23 Oct 2017 13:59:58 +0100 Subject: [PATCH] glib-mkenums: best effort attempt on non-utf8 encoded files. Some source files aren't valid utf-8 containing for example iso8859-1 accented characters in author's names. Replace invalid data with a replacement '?' character and print a warning to keep things working. Based on a patch from Christoph Reiter in https://bugzilla.gnome.org/show_bug.cgi?id=785113#c20 Upstream-Status: Submitted [https://bug785113.bugzilla-attachments.gnome.org/attachment.cgi?id=362098] Author: Patrick Welche Signed-off-by: Jackie Huang --- gobject/glib-mkenums.in | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/gobject/glib-mkenums.in b/gobject/glib-mkenums.in index 7cc55053c..9790a65a2 100755 --- a/gobject/glib-mkenums.in +++ b/gobject/glib-mkenums.in @@ -26,14 +26,6 @@ the GNU General Public License which can be found in the GLib source package. Sources, examples and contact information are available at http://www.gtk.org''' -# Python 2 defaults to ASCII in case stdout is redirected. -# This should make it match Python 3, which uses the locale encoding. -if sys.stdout.encoding is None: - output_stream = codecs.getwriter( - locale.getpreferredencoding())(sys.stdout) -else: - output_stream = sys.stdout - # pylint: disable=too-few-public-methods class Color: '''ANSI Terminal colors''' @@ -81,6 +73,31 @@ def write_output(output): global output_stream print(output, file=output_stream) + +# Python 2 defaults to ASCII in case stdout is redirected. +# This should make it match Python 3, which uses the locale encoding. +if sys.stdout.encoding is None: + output_stream = codecs.getwriter( + locale.getpreferredencoding())(sys.stdout) +else: + output_stream = sys.stdout + + +# Some source files aren't utf-8 and the old perl version didn't care. +# Replace invalid data with a replacement character to keep things working. +# https://bugzilla.gnome.org/show_bug.cgi?id=785113#c20 +decoding_errors = "replace_and_warn" + +def replace_and_warn(err): + # 7 characters of context either side of the offending character + print_warning('UnicodeWarning: {} at {} ({})'.format( + err.reason, err.start, + err.object[err.start - 7:err.end + 7])) + return ('?', err.end) + +codecs.register_error('replace_and_warn', replace_and_warn) + + # glib-mkenums.py # Information about the current enumeration flags = None # Is enumeration a bitmask? @@ -157,7 +174,8 @@ def parse_entries(file, file_name): m = re.match(r'\#include\s*<([^>]*)>', line) if m: newfilename = os.path.join("..", m.group(1)) - newfile = io.open(newfilename, encoding="utf-8") + newfile = io.open(newfilename, encoding="utf-8", + errors=decoding_errors) if not parse_entries(newfile, newfilename): return False @@ -253,7 +271,7 @@ def read_template_file(file): } in_ = 'junk' - ifile = io.open(file, encoding="utf-8") + ifile = io.open(file, encoding="utf-8", errors=decoding_errors) for line in ifile: m = re.match(r'\/\*\*\*\s+(BEGIN|END)\s+([\w-]+)\s+\*\*\*\/', line) if m: @@ -408,7 +426,8 @@ def process_file(curfilename): firstenum = True try: - curfile = io.open(curfilename, encoding="utf-8") + curfile = io.open(curfilename, encoding="utf-8", + errors=decoding_errors) except IOError as e: if e.errno == errno.ENOENT: print_warning('No file "{}" found.'.format(curfilename)) -- 2.14.2