我正在尝试将mbox文件转换为适合导入MongoDB的JSON结构。
我参考的是《挖掘社交网络》(Mining the Social Web)第二版中关于邮箱的章节,但代码无法正常运行。

 import sys
 import mailbox
 import email
 import quopri
 import json
 import time
 from BeautifulSoup import BeautifulSoup
 from dateutil.parser import parse

 # Source mailbox (mbox format); the JSON export is written right next to it
 # under the same name with a '.json' suffix appended.
 MBOX = 'resources/ch06-mailboxes/data/enron.mbox'
 OUT_FILE = '{0}.json'.format(MBOX)

 def cleanContent(msg):
     """Return the plain-text content of a message body.

     The body is decoded from quoted-printable and then passed through
     BeautifulSoup so that only its text nodes are kept.

     Args:
         msg: The message body as a text string.

     Returns:
         The concatenated text nodes of the body, or '' when BeautifulSoup
         fails to parse it.
     """
     # Decode message from "quoted printable" format, but first
     # re-encode, since decodestring will try to do a decode of its own
     msg = quopri.decodestring(msg.encode('utf-8'))

     # Strip out HTML tags, if any are present.
     # Bail on unknown encodings if errors happen in BeautifulSoup.
     # ``except Exception`` (rather than a bare ``except``) keeps this
     # best-effort while still letting KeyboardInterrupt/SystemExit through.
     try:
         soup = BeautifulSoup(msg)
     except Exception:
         return ''
     return ''.join(soup.findAll(text=True))

  # There's a lot of data to process, and the Pythonic way to do it is with a
  # generator. See http://wiki.python.org/moin/Generators.
  # Using a generator requires a trivial encoder to be passed to json for object
  # serialization.

 class Encoder(json.JSONEncoder):
     """JSON encoder that serializes arbitrary iterables as JSON arrays."""

     def default(self, o):
         """Return a list built from ``o`` so that ``json`` can serialize it.

         ``json`` calls this hook only for objects it cannot serialize
         natively (for example generators). Objects that are not iterable
         are handed to the base class, which raises ``TypeError``.
         """
         try:
             items = iter(o)
         except TypeError:
             return json.JSONEncoder.default(self, o)
         return list(items)
 # The generator itself...
 def gen_json_msgs(mb):
     """Lazily yield the JSON-ready dict for every message in a mailbox.

     Args:
         mb: A mailbox object whose ``next()`` method returns the next
             message, or ``None`` once there are no more messages.

     Yields:
         The result of ``jsonifyMessage`` for each message, in mailbox order.
     """
     # Two-argument iter() keeps calling mb.next() until it returns None.
     for msg in iter(mb.next, None):
         yield jsonifyMessage(msg)

  def _to_text(value):
      """Decode byte strings as UTF-8 (ignoring errors); pass text through."""
      if isinstance(value, bytes):
          return value.decode('utf-8', 'ignore')
      return value


  def jsonifyMessage(msg):
      """Convert an email message into a dict that can be dumped as JSON.

      Args:
          msg: A message object providing ``items()`` (header pairs) and
              ``walk()`` (MIME parts), as the ``email`` package's messages do.

      Returns:
          A dict with one entry per header plus a ``'parts'`` list holding a
          ``{'contentType', 'content'}`` dict for every text part.  ``To``,
          ``Cc`` and ``Bcc`` are split into lists of addresses, and ``Date``
          (when present) becomes ``{'$date': <milliseconds since epoch>}``.
      """
      # Local import so this block does not depend on the file's import list.
      import calendar

      json_msg = {'parts': []}
      for (k, v) in msg.items():
          json_msg[k] = _to_text(v)

      # The To, Cc, and Bcc fields, if present, could have multiple items.
      # Note that not all of these fields are necessarily defined.
      for k in ['To', 'Cc', 'Bcc']:
          if not json_msg.get(k):
              continue
          # The header is already decoded text at this point, so it must not
          # be decoded a second time; only strip whitespace and split.
          addresses = json_msg[k]
          for whitespace in ('\n', '\t', '\r', ' '):
              addresses = addresses.replace(whitespace, '')
          json_msg[k] = addresses.split(',')

      for part in msg.walk():
          if part.get_content_maintype() != 'text':
              sys.stderr.write(
                  "Skipping MIME content in JSONification ({0})\n".format(
                      part.get_content_maintype()))
              continue

          # The payload is fetched undecoded; cleanContent handles the
          # quoted-printable decoding itself.
          content = _to_text(part.get_payload(decode=False))
          json_msg['parts'].append({
              'contentType': part.get_content_type(),
              'content': cleanContent(content),
          })

      # Finally, convert date from asctime to milliseconds since epoch using the
      # $date descriptor so it imports "natively" as an ISODate object in MongoDB.
      # Messages without a Date header are left without a 'Date' entry.
      if json_msg.get('Date'):
          then = parse(json_msg['Date'])
          if then.utcoffset() is not None:
              # Timezone-aware: honour the offset instead of reinterpreting
              # the wall-clock fields in the local timezone.
              seconds = calendar.timegm(then.utctimetuple())
          else:
              # Naive: interpret as local time.
              seconds = int(time.mktime(then.timetuple()))
          millis = seconds * 1000 + then.microsecond // 1000
          json_msg['Date'] = {'$date': millis}

      return json_msg

# Write each message out as a JSON object on a separate line
# for easy import into MongoDB via mongoimport.
# Both files are opened in ``with`` blocks so they are closed even if the
# conversion fails part-way through.  The directory of OUT_FILE must be
# writable by the current user, otherwise open() fails with
# "Permission denied".
with open(MBOX, 'rb') as mbox_file:
    mbox = mailbox.UnixMailbox(mbox_file, email.message_from_file)
    with open(OUT_FILE, 'w') as f:
        for msg in gen_json_msgs(mbox):
            if msg is not None:
                f.write(json.dumps(msg, cls=Encoder) + '\n')

print("All done")

getting error:
80 # for easy import into MongoDB via mongoimport
  81
  ---> 82 f = open(OUT_FILE, 'w')
  83 for msg in gen_json_msgs(mbox):
  84     if msg != None:

IOError: [Errno 13] Permission denied: 'resources/ch06-mailboxes/data/enron.mbox.json'

最佳答案

看来您的问题与用户权限有关,而不是Python本身的问题。第82行试图在“data”文件夹中打开(创建)文件,但权限被拒绝。您可以尝试在终端中使用sudo命令执行脚本:

sudo python3 <your script name>

这应该解决您指出的错误。

PS:Python 3中print是一个函数;第88行应写为
print('All done')

关于python-3.x - 如何将mbox转换为JSON结构?,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/22006616/

10-12 16:37