python - Python中带有文件的Unicode解码错误

我在解码方面遇到了麻烦。我在其他帖子中找到了如何用 u'string'.encode 处理简单字符串的方法，但找不到把它应用到文件上的办法。

任何帮助，将不胜感激！

这是代码。

# Excerpt of the body of subistitua(); `file`, `txt` and `novo_txt` are its
# parameters.
text = file.read()
# NOTE(review): replace() returns a new string and the result is not
# assigned, so `text` is unchanged after this line.
text.replace(txt.encode('utf-8'), novo_txt.encode('utf-8'))
file.seek(0)  # rewind
# NOTE(review): if `file` is a plain Python 2 file object, `text` is already
# a byte string and .encode() first decodes it implicitly -- confirm how
# `file` was opened.
file.write(text.encode('utf-8'))

如果有帮助的话，这是完整的代码。

#!/usr/bin/env python
# coding: utf-8

"""
 Script to help translate the methods of some code from
 Portuguese to English.
"""

from multiprocessing import Pool
from mock import MagicMock
from goslate import Goslate
import fnmatch
import logging
import os
import re
import urllib2

# Number of worker processes; with 1 the __main__ block processes the files
# serially instead of creating a process pool.
_MAX_PEERS = 1
# Start every run with a fresh log file. Any OSError from the removal is
# ignored (e.g. the file does not exist yet).
try:
    os.remove('traducoes.log')
except OSError:
    pass
# Module logger: records at INFO and above are written to traducoes.log.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.FileHandler('traducoes.log')
logger.addHandler(handler)


def fileWalker(ext, dirname, names):
    """
    Visitor for os.path.walk: collect the paths with the wanted extension

    `ext` is a two-item sequence: the extension (e.g. ".py") followed by
    the list that receives the joined path of every matching name.
    """
    pattern = "*" + ext[0]
    matching = [name for name in names if fnmatch.fnmatch(name, pattern)]
    for name in matching:
        ext[1].append(os.path.join(dirname, name))


def encontre_text(file):
    """
    Collect the words of the file that have a '_' inside them

    Everything `file.read()` returns is decoded as UTF-8 and scanned; the
    matches are returned as a list, in order of appearance.
    """
    raw = file.read()
    text = raw.decode('utf-8')
    # Word characters, an underscore, then at least one more word character.
    pattern = r"\w+(?<=_)\w+"
    return re.findall(pattern, text)


def traduza_palavra(txt):
    """
    Translate the word/phrase to English and return it with underscores

    The underscores of `txt` are turned into spaces before translating,
    unless `txt` starts with an underscore, in which case they are kept.
    'media' is rewritten as 'média' before the text is sent to Goslate,
    and the spaces of the translation are turned back into underscores.

    Prints a message and exits the program with status -1 when the
    connectivity probe fails.
    """
    try:
        # Connectivity probe only: the body of the response is never read.
        response = urllib2.urlopen('http://google.com', timeout=2)
    except urllib2.URLError:
        print("No network connection ")
        exit(-1)
    else:
        # This function runs once per word, so an unclosed response would
        # leak one connection per translated word.
        response.close()
    # startswith() instead of txt[0] so an empty string does not raise.
    if not txt.startswith('_'):
        txt = txt.replace('_', ' ')
    txt = txt.replace('media'.decode('utf-8'), 'média'.decode('utf-8'))
    gs = Goslate()
    # The source language is pinned to Brazilian Portuguese rather than
    # auto-detected.
    txt = gs.translate(txt, 'en', 'pt-br')
    txt = txt.replace(' en ', ' br ')
    return txt.replace(' ', '_')


def subistitua(file, txt, novo_txt):
    """
    Meant to rewrite the file with `txt` replaced by `novo_txt`

    Reads from the current position of `file` to its end, seeks back to
    the start and writes the text again.

    NOTE(review): as written the replacement never reaches the file; see
    the notes in the body.
    """
    text = file.read()
    # NOTE(review): replace() returns a new string and the result is not
    # assigned, so `text` is still the unmodified content below.
    text.replace(txt.encode('utf-8'), novo_txt.encode('utf-8'))
    file.seek(0)  # rewind
    # NOTE(review): with a plain Python 2 file object `text` is already a
    # byte string, so .encode() first decodes it implicitly and can fail
    # on non-ASCII data -- confirm how callers open `file`.
    file.write(text.encode('utf-8'))


def magica(File):
    """
    Worker for a single path taken from the list of files

    Opens `File` for reading and writing, finds its underscore words,
    translates each of them and hands every (word, translation) pair to
    subistitua. Prints an '[OK]' line for the file when it is finished.
    """
    # Only logged in serial mode (original note: not viable when
    # multithreaded).
    if _MAX_PEERS == 1:
        logger.info('\n---- File %s' % File)
    with open(File, "r+") as source:
        for original in encontre_text(source):
            translated = traduza_palavra(original)
            if translated != original:
                logger.info('%s -> %s [%s]' % (original, translated, File))
            subistitua(source, original, translated)
    print(File.ljust(70) + '[OK]'.rjust(5))

if __name__ == '__main__':
    # Fail fast without connectivity: every translation needs the network.
    try:
        response = urllib2.urlopen('http://www.google.com.br', timeout=1)
    except urllib2.URLError:
        print("No network connection ")
        exit(-1)
    else:
        response.close()  # only reachability matters; do not leak the socket

    root = './app'
    ex = ".py"
    files = []
    # fileWalker appends each matching path to `files`, the second item of
    # the argument list.
    os.path.walk(root, fileWalker, [ex, files])

    print('%d files found to be translated' % len(files))

    # Plain flag, initialised before the `try`, so the `finally` clause can
    # always report an outcome -- even when the pool cannot be created.
    all_translated = False
    try:
        if _MAX_PEERS > 1:
            _pool = Pool(processes=_MAX_PEERS)
            result = _pool.map_async(magica, files)
            result.wait()
            all_translated = result.successful()
        else:
            for f in files:
                magica(f)
            all_translated = True
    except AssertionError as e:
        print(e)
    finally:
        if all_translated:
            print('Translated all files')
        else:
            print('Some files were not translated')

谢谢大家的帮助！

最佳答案

在 Python 2 中，从文件读取得到的是普通的（字节）字符串对象，而不是 unicode 对象。无需对这些字符串调用 .encode()；实际上，这样做只会先触发一次到 Unicode 的隐式解码，而这一步可能会失败。

经验法则：使用 unicode 三明治。每当读取数据时，就在该阶段将其解码为 unicode；在整个代码中都使用 unicode 值；每当写入数据时，再在那一刻进行编码。您可以使用 io.open() 打开文件对象，它会自动为您完成编码和解码。

这也意味着您可以在任何地方使用 unicode 字面量：正则表达式如此，字符串字面量也如此。因此请使用：

def encontre_text(file):
    """Return the underscore words found in the text read from `file`.

    `file.read()` is expected to return unicode text already (for example
    a file opened with `io.open()`), so nothing is decoded here. A match is
    a run of word characters containing an underscore that is followed by
    at least one more word character.
    """
    text = file.read()  # assume `io.open()` was used
    # Unicode pattern written as an escaped u"" literal: same value as the
    # raw form, but the `ur` prefix is a syntax error on Python 3.
    return re.findall(u"\\w+(?<=_)\\w+", text)  # use a unicode pattern

和

def subistitua(file, txt, novo_txt):
    """Rewrite `file` in place with every `txt` replaced by `novo_txt`.

    `file` must be seekable, opened for reading and writing, and must
    handle unicode text (for example a file opened with `io.open()`);
    `txt` and `novo_txt` are unicode strings.
    """
    # Always start from the beginning: the caller may already have read
    # the file, which would leave nothing for read() to return.
    file.seek(0)
    text = file.read()  # assume `io.open()` was used
    text = text.replace(txt, novo_txt)
    file.seek(0)  # rewind
    file.write(text)
    # Drop whatever is left of the old content when the new text is shorter.
    file.truncate()

因为程序中的所有字符串值都已经是unicode，并且

# The non-ASCII literal needs a source encoding declaration in Python 2
# (e.g. `# coding: utf-8` at the top of the file).
txt = txt.replace(u'media', u'média')

因为 u'..' 这样的 unicode 字符串字面量不再需要解码。