我正在尝试从测试准备站点获取几个不同测试的数据。有不同的科目,每个科目都有一个专业,每个科目都有一个练习测试,每个科目都有几个问题。
subject <--- specialization <---- practice-test *------ question
这是我的代码:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pathlib
import time
import json
import os
driver=webdriver.Firefox(executable_path="../../../geckodriver.exe")
wait = WebDriverWait(driver, 15)
data=[]
def setup():
driver.get('https://www.varsitytutors.com/practice-tests')
try:
go_away_1= driver.find_element_by_class_name("ub-emb-iframe")
driver.execute_script("arguments[0].style.visibility='hidden'", go_away_1)
go_away_2= driver.find_element_by_class_name("ub-emb-iframe-wrapper")
driver.execute_script("arguments[0].style.visibility='hidden'", go_away_2)
go_away_3= driver.find_element_by_class_name("ub-emb-visible")
driver.execute_script("arguments[0].style.visibility='hidden'", go_away_3)
except:
pass
def get_subjects(subs=[]):
subject_clickables_xpath="/html/body/div[3]/div[9]/div/*/div[@data-subject]/div[1]"
subject_clickables=driver.find_elements_by_xpath(subject_clickables_xpath)
subject_names=map(lambda x : x.find_element_by_xpath('..').get_attribute('data-subject'), subject_clickables)
subject_pairs=zip(subject_names, subject_clickables)
return subject_pairs
def get_specializations(subject):
specialization_clickables_xpath="//div//div[@data-subject='"+subject+"']/following-sibling::div//div[@class='public_problem_set']//a[contains(.,'Practice Tests')]"
specialization_names_xpath="//div//div[@data-subject='"+subject+"']/following-sibling::div//div[@class='public_problem_set']//a[contains(.,'Practice Tests')]/../.."
specialization_names=map(lambda x : x.get_attribute('data-subject'), driver.find_elements_by_xpath(specialization_names_xpath))
specialization_clickables = driver.find_elements_by_xpath(specialization_clickables_xpath)
specialization_pairs=zip(specialization_names, specialization_clickables)
return specialization_pairs
def get_practices(subject, specialization):
practice_clickables_xpath="/html/body/div[3]/div[8]/div[3]/*/div[1]/a[1]"
practice_names_xpath="//*/h3[@class='subject_header']"
lengths_xpath="/html/body/div[3]/div[8]/div[3]/*/div[2]"
lengths=map(lambda x : x.text, driver.find_elements_by_xpath(lengths_xpath))
print(lengths)
practice_names=map(lambda x : x.text, driver.find_elements_by_xpath(practice_names_xpath))
practice_clickables = driver.find_elements_by_xpath(practice_clickables_xpath)
practice_pairs=zip(practice_names, practice_clickables)
return practice_pairs
def remove_popup():
try:
button=wait.until(EC.element_to_be_clickable((By.XPATH,"//button[contains(.,'No Thanks')]")))
button.location_once_scrolled_into_view
button.click()
except:
print('could not find the popup')
def get_questions(subject, specialization, practice):
remove_popup()
questions=[]
current_question=None
while True:
question={}
try:
WebDriverWait(driver,5).until(EC.presence_of_element_located((By.XPATH,"/html/body/div[3]/div[7]/div[1]/div[2]/div[2]/table/tbody/tr/td[1]")))
question_number=driver.find_element_by_xpath('/html/body/div[3]/div[7]/div[1]/div[2]/div[2]/table/tbody/tr/td[1]').text.replace('.','')
question_pre=driver.find_element_by_class_name('question_pre')
question_body=driver.find_element_by_xpath('/html/body/div[3]/div[7]/div[1]/div[2]/div[2]/table/tbody/tr/td[2]/p')
answer_choices=driver.find_elements_by_class_name('question_row')
answers=map(lambda x : x.text, answer_choices)
question['id']=question_number
question['pre']=question_pre.text
question['body']=question_body.text
question['answers']=list(answers)
questions.append(question)
choice=WebDriverWait(driver,20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,"input.test_button")))
driver.execute_script("arguments[0].click();", choice[3])
time.sleep(3)
except Exception as e:
if 'results' in driver.current_url:
driver.get(driver.current_url.replace('http://', 'https://'))
# last question has been answered; record results
remove_popup()
pathlib.Path('data/'+subject+'/'+specialization).mkdir(parents=True, exist_ok=True)
with open('data/'+subject+'/'+specialization+'/questions.json', 'w') as outfile:
json.dump(list(questions), outfile)
break
else:
driver.get(driver.current_url.replace('http://', 'https://'))
return questions
def scrape():
setup()
subjects=get_subjects()
for subject_name, subject_clickable in subjects:
subject={}
subject['name']=subject_name
subject['specializations']=[]
subject_clickable.click()
subject_url=driver.current_url.replace('http://', 'https://')
specializations=get_specializations(subject_name)
for specialization_name, specialization_clickable in specializations:
specialization={}
specialization['name']=specialization_name
specialization['practices']=[]
specialization_clickable.click()
specialization_url=driver.current_url.replace('http://', 'https://')
practices=get_practices(subject_name, specialization_name)
for practice_name, practice_clickable in practices:
practice={}
practice['name']=practice_name
practice_clickable.click()
questions=get_questions(subject_name, specialization_name, practice_name)
practice['questions']=questions
driver.get(specialization_url)
driver.get(subject_url)
data.append(subject)
print(data)
scrape()
运行它会产生错误消息:
Traceback (most recent call last):
File "scrape.py", line 141, in <module>
scrape()
File "scrape.py", line 126, in scrape
for practice_name, practice_clickable in practices:
File "scrape.py", line 49, in <lambda>
practice_names=map(lambda x : x.text, driver.find_elements_by_xpath(practice_names_xpath))
File "C:\Users\Joseph\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webelement.py", line 76, in text
return self._execute(Command.GET_ELEMENT_TEXT)['value']
File "C:\Users\Joseph\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webelement.py", line 628, in _execute
return self._parent.execute(command, params)
File "C:\Users\Joseph\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 312, in execute
self.error_handler.check_response(response)
File "C:\Users\Joseph\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 237, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.NoSuchElementException: Message: Web element reference not seen before: 980e5c29-e3af-4b13-979f-0f2bb58b3480
从一次练习测试中得到问题后,驱动程序需要返回特化页面,在那里可以找到下一个练习测试。因此,这些行(问题所在):
for practice_name, practice_clickable in practices:
practice={}
practice['name']=practice_name
practice_clickable.click()
questions=get_questions(subject_name, specialization_name, practice_name)
practice['questions']=questions
driver.get(specialization_url)
显然,在页面上不再找到下一个可点击的练习。为什么不?
另外,我不确定这是否值得提出自己的问题,但是我无法让这个程序在我的 Ubuntu 计算机上运行——
get_questions
函数在最后一个问题上停止并且不会转到结果页面。这是盖伊正在尝试的一个初步建议:
def scrape():
setup()
subjects=get_subjects()
for subject_name, subject_clickable in subjects:
subject={}
subject['name']=subject_name
subject['specializations']=[]
subject_clickable.click()
subject_url=driver.current_url.replace('http://', 'https://')
specializations=get_specializations(subject_name)
for specialization_name, specialization_clickable in specializations:
specialization={}
specialization['name']=specialization_name
specialization['practices']=[]
specialization_clickable.click()
specialization_url=driver.current_url.replace('http://', 'https://')
practices=get_practices(subject_name, specialization_name)
practices_len = len(list(get_practices(subject_name, specialization_name)))
for i in range(practices_len):
practices_list = list(get_practices(subject_name, specialization_name))
practice = {}
practice['name'] = practices_list[i][0]
practices_list[i][1].click()
# for practice_name, practice_clickable in practices:
# practice={}
# practice['name']=practice_name
# practice_clickable.click()
# questions=get_questions(subject_name, specialization_name, practice_name)
# practice['questions']=questions
driver.get(specialization_url)
driver.get(subject_url)
data.append(subject)
print(data)
scrape()
编辑:根据休伯特的建议,我尝试了以下内容:
practices = get_practices(subject_name, specialization_name)
practices = [item[0] for item in practices]
for index, practice_name in enumerate(practices):
practice={}
practice['name'] = practice_name
practice_row = driver.find_element_by_xpath('//*[text()="'+practice_name+'"]/..')
practice_clickable_n = practice_row.find_element_by_link_text('Begin')
print('old:', practice_clickable[index])
print('new:', practice_clickable_n)
practice_clickable_n.click()
questions=get_questions(subject_name, specialization_name, practice_name)
这就是结果:
<map object at 0x7fabc0129860>
<map object at 0x7fabc0129898>
Traceback (most recent call last):
File "scrape.py", line 140, in <module>
scrape()
File "scrape.py", line 131, in scrape
print('old:', practice_clickable[index])
IndexError: list index out of range
最佳答案
这个错误信息...
selenium.common.exceptions.NoSuchElementException: Message: Web element reference not seen before: 980e5c29-e3af-4b13-979f-0f2bb58b3480
...暗示 GeckoDriver 无法识别 WebElement。
此错误来自
get(webEl, win)
中的 Marionette source code :get(webEl, win) {
if (!(webEl instanceof WebElement)) {
throw new TypeError(pprint`Expected web element, got: ${webEl}`);
}
if (!this.has(webEl)) {
throw new NoSuchElementError(
"Web element reference not seen before: " + webEl.uuid
);
}
@fc 在讨论中的 comment 'Element reference not seen before: undefined' using geckodriver, waitForElementVisible fails 解释了实际问题:
然而,核心问题在 Intermittent test_navigation.py TestRefresh.test_basic | NoSuchElementException: Failed to trigger opening a new tab: Web element reference not seen before 中讨论,随后通过 changeset 解决
解决方案
使用最新版本的二进制文件将解决以下问题:
关于python - selenium.common.exceptions.NoSuchElementException : Message: Web element reference not seen before using GeckoDriver Firefox and Selenium with Python,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/59070019/