N元马尔科夫链的实现

马尔可夫模型（Markov Model）是一种统计模型，广泛应用于语音识别、词性自动标注、音字转换、概率文法等自然语言处理领域。经过长期发展，尤其是在语音识别中的成功应用，它已成为一种通用的统计工具。

以下利用一篇英文演讲来实现一个简单的文字生成器。生成的结果其实是一段胡言乱语，但我们可以通过这个例子粗略理解机器学习。首先是英文演讲文章的链接：http://pythonscraping.com/files/inaugurationSpeech.txt

以及我上一篇处理该文章的博客链接：http://www.cnblogs.com/ybf-yyj/p/7399149.html

以下以生成100个单词的马尔科夫链为例：

#-*- coding:utf-8 -*-

from urllib2 import urlopen

from random import randint

def wordListSum(wordList):
    """Return the total number of occurrences recorded in ``wordList``.

    Args:
        wordList: dict mapping a word to its occurrence count.

    Returns:
        The sum of all counts; 0 for an empty dict.
    """
    return sum(wordList.values())

def retrieveRandomWord(wordList):
    """Pick one word from ``wordList`` at random, weighted by its count.

    Args:
        wordList: dict mapping each candidate word to its occurrence count.

    Returns:
        One key of ``wordList``. A word with count ``c`` is chosen with
        probability ``c / total``, where ``total`` is the sum of all counts.

    Raises:
        ValueError: if ``wordList`` is empty or its counts sum to less
            than 1, since there is nothing to draw from.
    """
    total = sum(wordList.values())
    if total < 1:
        raise ValueError(
            "wordList must contain at least one word with a positive count")

    # Draw a position in 1..total so that repeated runs produce different
    # sentences.
    randIndex = randint(1, total)

    # Walk the words, subtracting each count from the drawn position; the
    # word whose count brings the remainder to zero or below owns it.
    for word, value in wordList.items():
        randIndex -= value
        if randIndex <= 0:
            return word

def buildWordDic(text):
    """Build a table of word-to-next-word transition counts from ``text``.

    Double quotes are removed, and each of ``, . ; :`` is split off as its
    own token so that punctuation takes part in the chain instead of being
    glued to the preceding word. The text is then split on any run of
    whitespace (spaces, newlines, tabs, carriage returns).

    Args:
        text: the source text as a single string.

    Returns:
        A dict of dicts: ``result[a][b]`` is the number of times token ``b``
        immediately follows token ``a``. For ``"How do you do."`` the result
        is ``{'How': {'do': 1}, 'do': {'you': 1, '.': 1}, 'you': {'do': 1}}``.
        A token that is never followed by another token (for example the
        last token, if it occurs nowhere else) has no key in the result.
    """
    # Strip double quotes entirely.
    text = text.replace("\"", "")

    # Surround punctuation with spaces so each mark becomes its own token.
    for symbol in (',', '.', ';', ':'):
        text = text.replace(symbol, " " + symbol + " ")

    # split() with no argument splits on any whitespace and never yields
    # empty strings, so "\r\n" line endings and tabs are handled too.
    words = text.split()

    wordDict = {}
    # Count every adjacent (previous, current) pair of tokens.
    for previous, current in zip(words, words[1:]):
        followers = wordDict.setdefault(previous, {})
        followers[current] = followers.get(current, 0) + 1

    return wordDict

# Download the speech, closing the connection even if the read fails.
response = urlopen('http://pythonscraping.com/files/inaugurationSpeech.txt')
try:
    # Keep the decoded text as-is: wrapping it in str() would re-encode it
    # as ASCII on Python 2 and fail on any non-ASCII character.
    text = response.read().decode('utf-8')
finally:
    response.close()

wordDict = buildWordDic(text)

# Number of words to generate.
length = 100

chain = ''

# Arbitrary starting word.
currentword = 'I'

for i in range(0, length):
    chain += currentword + ' '
    # A word that is never followed by another one has no entry in
    # wordDict; end the chain there instead of raising KeyError.
    if currentword not in wordDict:
        break
    currentword = retrieveRandomWord(wordDict[currentword])

print(chain)