我正在尝试将 mbox 文件转换为适合导入 MongoDB 的 JSON 结构。
我使用的是《Mining the Social Web》(挖掘社交网络)第二版中邮箱一章的代码,但无法正常工作。
我正在尝试将 mbox 文件转换为适合导入 MongoDB 的 JSON 结构。
我使用的是《Mining the Social Web》(挖掘社交网络)第二版中邮箱一章的代码,但无法正常工作。
import sys
import mailbox
import email
import quopri
import json
import time
from BeautifulSoup import BeautifulSoup
from dateutil.parser import parse
# Path of the mbox archive that the script reads.
MBOX = 'resources/ch06-mailboxes/data/enron.mbox'
# The JSON output path is the input path with '.json' appended.
OUT_FILE = ''.join([MBOX, '.json'])
def cleanContent(msg):
    """Return the text content of a message body with HTML markup removed.

    Args:
        msg: The message body as a text string. It is re-encoded to UTF-8
            and then decoded from "quoted printable" before parsing.

    Returns:
        The text nodes found by BeautifulSoup joined into one string, or
        '' if BeautifulSoup raised while parsing the body.
    """
    # Decode message from "quoted printable" format, but first
    # re-encode, since decodestring will try to do a decode of its own
    msg = quopri.decodestring(msg.encode('utf-8'))

    # Strip out HTML tags, if any are present.
    # Bail on unknown encodings if errors happen in BeautifulSoup.
    try:
        soup = BeautifulSoup(msg)
    except Exception:
        # Deliberately best-effort: an unparseable body becomes empty text.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return ''
    return ''.join(soup.findAll(text=True))
# There's a lot of data to process, and the Pythonic way to do it is with a
# generator. See http://wiki.python.org/moin/Generators.
# Using a generator requires a trivial encoder to be passed to json for object
# serialization.
class Encoder(json.JSONEncoder):
    """JSON encoder that serializes otherwise-unsupported iterables as lists.

    ``json`` calls ``default`` only for objects it cannot serialize itself
    (for example generators), so natively supported types are unaffected.
    """

    def default(self, o):
        """Return the items of *o* as a list.

        Raises:
            TypeError: If *o* is not iterable.
        """
        return list(o)
# The generator itself...
def gen_json_msgs(mb):
    """Yield the JSON-ready form of each message in a mailbox.

    Args:
        mb: A mailbox object whose ``next()`` method returns the next
            message, or ``None`` once there are no more messages.

    Yields:
        The result of ``jsonifyMessage`` for each message, in mailbox order.
    """
    while True:
        msg = mb.next()
        # A None return marks the end of the mailbox.
        if msg is None:
            break
        yield jsonifyMessage(msg)
def jsonifyMessage(msg):
    """Convert an email message into a JSON-serializable dict.

    Args:
        msg: A message object providing ``items()`` (header name/value
            pairs) and ``walk()`` (iteration over its MIME parts). Header
            values and payloads are expected to be byte strings, as this
            Python 2 code calls ``.decode`` on them.

    Returns:
        A dict holding every header under its own name, plus:
          * 'To', 'Cc', 'Bcc' (when present and non-empty): the header with
            whitespace removed, split on commas into a list.
          * 'parts': one ``{'contentType': ..., 'content': ...}`` dict for
            each MIME part whose main type is 'text'.
          * 'Date': ``{'$date': <milliseconds>}`` in place of the raw header.

    Raises:
        KeyError: If the message has no 'Date' header.
    """
    json_msg = {'parts': []}
    for (k, v) in msg.items():
        json_msg[k] = v.decode('utf-8', 'ignore')

    # The To, Cc, and Bcc fields, if present, could have multiple items.
    # Note that not all of these fields are necessarily defined.
    for k in ['To', 'Cc', 'Bcc']:
        if not json_msg.get(k):
            continue
        # The value was already decoded above. Decoding it a second time is
        # redundant and, in Python 2, implicitly ASCII-encodes the text first,
        # which raises UnicodeEncodeError on non-ASCII recipients.
        json_msg[k] = json_msg[k].replace('\n', '').replace('\t', '') \
            .replace('\r', '').replace(' ', '').split(',')

    for part in msg.walk():
        json_part = {}
        if part.get_content_maintype() != 'text':
            # Only text parts are kept; report anything else on stderr.
            # sys.stderr.write works the same in Python 2 and Python 3.
            sys.stderr.write(
                "Skipping MIME content in JSONification ({0})\n".format(
                    part.get_content_maintype()))
            continue

        json_part['contentType'] = part.get_content_type()
        content = part.get_payload(decode=False).decode('utf-8', 'ignore')
        json_part['content'] = cleanContent(content)
        json_msg['parts'].append(json_part)

    # Finally, convert date from asctime to milliseconds since epoch using the
    # $date descriptor so it imports "natively" as an ISODate object in MongoDB
    then = parse(json_msg['Date'])
    # NOTE(review): time.mktime interprets the time tuple as local time, so a
    # UTC offset parsed from the Date header is not applied here - confirm
    # whether that is intended before relying on the absolute timestamps.
    millis = int(time.mktime(then.timetuple()) * 1000 + then.microsecond / 1000)
    json_msg['Date'] = {'$date': millis}

    return json_msg
# Script driver: read every message from MBOX and write it to OUT_FILE.
# The context managers close both files even if the conversion fails part-way.
with open(MBOX, 'rb') as mbox_file:
    mbox = mailbox.UnixMailbox(mbox_file, email.message_from_file)

    # Write each message out as a JSON object on a separate line
    # for easy import into MongoDB via mongoimport.
    # Opening OUT_FILE for writing requires write permission on its
    # directory; without it, open() fails with "Permission denied".
    with open(OUT_FILE, 'w') as f:
        for msg in gen_json_msgs(mbox):
            if msg is not None:
                f.write(json.dumps(msg, cls=Encoder) + '\n')

# Parenthesized form prints the same text under Python 2 and Python 3.
print("All done")
getting error:
80 # for easy import into MongoDB via mongoimport
81
---> 82 f = open(OUT_FILE, 'w')
83 for msg in gen_json_msgs(mbox):
84 if msg != None:
IOError: [Errno 13] Permission denied: 'resources/ch06-mailboxes/data/enron.mbox.json'
最佳答案
看来您的问题与用户权限有关,而不是 Python 本身。第82行尝试在 data 文件夹中打开(创建)输出文件,但权限被拒绝。您可以尝试在终端中使用 sudo
命令执行脚本:
sudo python3 <your script name>
这应该解决您指出的错误。
PS:Python 3 中 print 是一个函数;第88行应写为 print('All done')
关于python-3.x - 如何将mbox转换为JSON结构?,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/22006616/