我正在研究的文本云程序需要帮助。我意识到这是家庭作业,但是我已经走了很长一段路,直到现在被困了几个小时。我被困在网络爬虫部分。该程序应该打开一个页面,收集该页面中的所有单词,然后按频率对其进行排序。然后应该打开该页面上的所有链接并获取该页面上的单词,依此类推。深度由全局变量DEPTH控制。最后,应该将所有页面中的所有单词放在一起形成一个文本云。
我正在尝试使用递归调用函数来保持打开链接,直到达到深度为止。顶部的import语句仅使用名为getHTML(URL)的函数,该函数返回页面上单词列表以及页面上任何链接的元组。
到目前为止,这是我的代码。除getRecursiveURLs(url, DEPTH)和makeWords(i)之外,每个函数均按预期工作。我也不是100%确定底部的counter(List)函数是否正确。
from hmc_urllib import getHTML
MAXWORDS = 50   # cap on how many words counter() prints and keeps in its result
DEPTH = 2       # how many levels of links the crawler follows from the start page
all_links = []  # global accumulator of every link discovered during the crawl
def getURL():
    """Prompt the user for a starting URL and kick off the crawl.

    Returns a tuple of (word-frequency dict for the entered page,
    the accumulated list of links crawled down to DEPTH).
    """
    start = input('Please enter a URL: ')
    start_page_words = makeListOfWords(start)
    discovered = getRecursiveURLs(start, DEPTH)
    return start_page_words, discovered
def getRecursiveURLs(url, DEPTH):
    """Crawl pages starting at url, following links down to DEPTH levels.

    Every newly seen link is appended to the global all_links list;
    its words are tallied (and printed) via makeWords.  Returns all_links.

    Fix over the original: recursion now happens ONLY for links that are
    not already in all_links.  The original recursed into every link
    unconditionally (and before the membership check), so pages that link
    to each other were re-crawled repeatedly, bounded only by DEPTH.
    """
    # getHTML returns a (page_text, links) tuple; only the links matter here.
    links = getHTML(url)[1]
    if DEPTH > 0:
        for link in links:
            if link not in all_links:   # visit each page at most once
                all_links.append(link)
                makeWords(link)         # tally/print words for the new page
                getRecursiveURLs(link, DEPTH - 1)
    return all_links
def makeWords(i):
    """Build a word-frequency dict for every page in the global all_links
    list and merge them into one combined {word: count} dict, which is
    returned.

    The i parameter is kept for interface compatibility; the original
    implementation immediately shadowed it with the loop variable, so the
    function has always operated on all_links, not on its argument.

    Fixes over the original: the original returned only the LAST page's
    dict and raised NameError when all_links was empty; counts are now
    aggregated across all pages and an empty crawl yields {}.
    """
    combined = {}
    for link in all_links:
        # makeListOfWords returns a {word: count} dict for one page.
        for word, count in makeListOfWords(link).items():
            combined[word] = combined.get(word, 0) + count
    return combined
def makeListOfWords(URL):
    """Fetch the page at URL and return its cleaned word-frequency dict
    (via cleaner, which also prints the counts)."""
    page = getHTML(str(URL))
    words = page[0].split()   # page[0] is the page text
    return cleaner(words)
def cleaner(L):
    """Strip punctuation from each word, drop stop words, stem the
    survivors, and hand the result to counter() for frequency counting.

    Fix over the original: it called x.remove(c) while iterating x, which
    skips the element following each removal, so consecutive stop words
    survived the filter.  Filtering is now done with a comprehension.
    """
    stopList = ['', 'a', 'i', 'the', 'and', 'an', 'in', 'with', 'for',
                'it', 'am', 'at', 'on', 'of', 'to', 'is', 'so', 'too',
                'my', 'but', 'are', 'very', 'here', 'even', 'from',
                'them', 'then', 'than', 'this', 'that', 'though']
    depunctuated = [dePunc(word) for word in L]
    kept = [word for word in depunctuated if word not in stopList]
    stemmed = [stemmer(word) for word in kept]
    return counter(stemmed)
def dePunc(rawword):
    """Return rawword with everything except ASCII letters removed."""
    letters = []
    for ch in rawword:
        if ('a' <= ch <= 'z') or ('A' <= ch <= 'Z'):
            letters.append(ch)
    return ''.join(letters)
def stemmer(word):
    """Strip a common inflectional ending from word.

    Handles 2- and 3-letter suffixes and un-doubles a doubled final
    consonant (spammers -> spam, spammed -> spam, players -> play,
    played -> play).  Words too short to carry a suffix plus a stem are
    returned unchanged.

    Fixes over the original:
    * length guards prevent IndexError on short words ("es", "res",
      "ring"), where word[-4]/word[-5] ran off the front of the string;
    * the "2-letter suffix, no doubled consonant" branch stripped 3
      characters instead of 2 ("played" -> "pla"); it now strips 2.
    """
    endings = ['ed', 'es', 's', 'ly', 'ing', 'er', 'ers']
    n = len(word)
    # 3-letter suffixes: need >= 5 letters so word[-4] and word[-5] exist.
    if n >= 5 and word[-3:] in endings:
        if word[-4] == word[-5]:
            return word[:-4]   # doubled consonant: spammers -> spam
        return word[:-3]       # players -> play
    # 2-letter suffixes: need >= 4 letters so word[-3] and word[-4] exist.
    if n >= 4 and word[-2:] in endings:
        if word[-3] == word[-4]:
            return word[:-3]   # doubled consonant: spammed -> spam
        return word[:-2]       # played -> play
    # Not inflected (or too short to tell): return as-is.
    return word
def counter(List):
    """Tally word frequencies, print the MAXWORDS most frequent entries
    (as word(count), one per line, most frequent first) and return them
    as a {word: frequency} dict."""
    freq = {}
    for word in List:
        if word in freq:
            freq[word] += 1
        else:
            freq[word] = 1
    ranked = sorted(freq, key=freq.get, reverse=True)
    result = {}
    for word in ranked[:MAXWORDS]:
        print(word, '(', freq[word], ')', sep='')
        result[word] = freq[word]
    return result
最佳答案
目前尚不清楚任务的确切要求,但是据我所知,您希望将直到DEPTH深度的所有页面各访问一次。另外,您希望收集所有页面上的所有单词并使用汇总的结果。下面的代码段就是您要寻找的内容,但是未经测试(我没有hmc_urllib)。 all_links
,makeWords
和makeListOfWords
已被删除,但其余代码相同。
visited_links = []  # global record of pages already crawled, to avoid revisits
def getURL():
    """Ask the user for a starting URL, crawl it down to DEPTH levels,
    and return the cleaned word counts aggregated over every page."""
    start = input('Please enter a URL: ')
    all_words = getRecursiveURLs(start, DEPTH)
    # cleaner also prints the word count for all pages combined.
    return cleaner(all_words)
def getRecursiveURLs(url, DEPTH):
    """Collect the words on url and, recursing down to DEPTH levels, on
    every not-yet-visited page it links to.  Returns one flat word list.

    Marks each crawled page in the global visited_links list so no page
    is fetched twice.
    """
    page_text, page_links = getHTML(url)
    visited_links.append(url)   # mark this page as seen before recursing
    words = page_text.split()
    if DEPTH > 0:
        # Membership is re-checked lazily, so links visited by an earlier
        # recursive call in this loop are also skipped.
        unseen = (link for link in page_links if link not in visited_links)
        for link in unseen:
            words.extend(getRecursiveURLs(link, DEPTH - 1))
    return words
获得清除和词干的单词列表后,可以使用以下函数来生成单词计数字典并分别打印单词计数字典:
def counter(words):
    """Return a {word: frequency} dict for the given word list.

    Example Input:  ['spam', 'egg', 'egg', 'egg', 'spam', 'spam', 'egg', 'egg']
    Example Output: {'spam': 3, 'egg': 5}

    Fixes over the original: it referenced an undefined name `x`
    (NameError) instead of the `words` parameter, and its docstring
    example used invalid dict syntax.  Counting is now a single O(n)
    pass instead of calling list.count once per distinct word.
    """
    freq = {}
    for word in words:
        freq[word] = freq.get(word, 0) + 1
    return freq
def print_count(word_count, word_max):
    """Print up to word_max entries of word_count, most frequent first,
    one per line in the form word(count).

    Example Input: {'spam': 3, 'egg': 5}
    """
    by_frequency = sorted(word_count, key=word_count.get, reverse=True)
    for word in by_frequency[:word_max]:
        print(word, '(', word_count[word], ')', sep='')