Selenium 有时找不到元素
问题描述
我正在尝试抓取URL列表:
https://www.jobsbank.gov.sg/ICMSPortal/portlets/JobBankHandler/SearchDetail.do?id=JOB-2016-0010810 https://www.jobsbank.gov.sg/ICMSPortal/portlets/JobBankHandler/SearchDetail.do?id=JOB-2015-0374997
我配合 Selenium WebDriver 使用以下代码:
# Open one job-detail page in a new Firefox session and print its fields.
# `link` is the URL passed to driver.get(). Nothing is returned; every value
# is written to stdout with Python 2 print statements.
# NOTE(review): the indentation of the function body appears to have been
# lost in this listing.
def jobdesc(link):
driver = webdriver.Firefox()
driver.get(link)
# Every lookup below runs right after get() with no explicit wait, and uses
# an absolute, position-based XPath rooted at the element with id "wrapper".
jobname = driver.find_element_by_xpath('//*[@id="wrapper"]/div[1]/div/div[3]/div/div[2]/div[1]/div[2]/div[1]/h3').text
print "job title" + " " + jobname
# This is the lookup that raised NoSuchElementException in the reported
# traceback.
postingdate = driver.find_element_by_xpath('//*[@id="wrapper"]/div[1]/div/div[3]/div/div[2]/div[1]/div[2]/div[1]/table/tbody/tr/td[1]/div/p[1]').text
print "posting date" + " " + postingdate
expirydate = driver.find_element_by_xpath('//*[@id="wrapper"]/div[1]/div/div[3]/div/div[2]/div[1]/div[2]/div[1]/table/tbody/tr/td[1]/div/p[2]').text
print "expiry date" + " " + expirydate
jobid = driver.find_element_by_xpath('//*[@id="wrapper"]/div[1]/div/div[3]/div/div[2]/div[1]/div[2]/div[1]/table/tbody/tr/td[2]/div/p').text
print "job id" + " " + jobid
location = driver.find_element_by_xpath('//*[@id="wrapper"]/div[1]/div/div[3]/div/div[2]/div[2]/div/dl[1]/table/tbody/tr/td/p[1]').text
print "location" + " " + location
category = driver.find_element_by_xpath('//*[@id="wrapper"]/div[1]/div/div[3]/div/div[2]/div[2]/div/dl[2]/table/tbody/tr/td/ul/li/span[1]').text
print "category" + " " + category
industry = driver.find_element_by_xpath('//*[@id="wrapper"]/div[1]/div/div[3]/div/div[2]/div[2]/div/dl[3]/ul/li/table/tbody/tr/td/ul/li/span[1]').text
print "industry" + " " + industry
emptype = driver.find_element_by_xpath('//*[@id="wrapper"]/div[1]/div/div[3]/div/div[2]/div[2]/div/ul[1]/li/table/tbody/tr/td/ul/li/span[1]').text
print "emptype" + " " + emptype
workhrs = driver.find_element_by_xpath('//*[@id="wrapper"]/div[1]/div/div[3]/div/div[2]/div[2]/div/ul[2]/li/span[1]').text
print "workhrs" + " " + workhrs
shiftpattern = driver.find_element_by_xpath('//*[@id="wrapper"]/div[1]/div/div[3]/div/div[2]/div[2]/div/ul[3]/li/span[1]').text
print "shiftpattern" + " " + shiftpattern
salary = driver.find_element_by_xpath('//*[@id="wrapper"]/div[1]/div/div[3]/div/div[2]/div[2]/div/ul[4]/li/table/tbody/tr/td/span[1]').text
print "salary" + " " + salary
joblevel = driver.find_element_by_xpath('//*[@id="wrapper"]/div[1]/div/div[3]/div/div[2]/div[2]/div/ul[5]/li/table/tbody/tr/td/ul/li/span[1]').text
print "joblevel" + " " + joblevel
yearsofexp = driver.find_element_by_xpath('//*[@id="wrapper"]/div[1]/div/div[3]/div/div[2]/div[2]/div/ul[6]/li').text
print "yearofexp" + " " + yearsofexp
vacancyno = driver.find_element_by_xpath('//*[@id="wrapper"]/div[1]/div/div[3]/div/div[2]/div[2]/div/span[2]').text
print "vacancyno" + " " + vacancyno
jobadviews = driver.find_element_by_xpath('//*[@id="wrapper"]/div[1]/div/div[3]/div/div[2]/div[2]/div/span[4]').text
print "jobadviews" + " " + jobadviews
jobapplicants = driver.find_element_by_xpath('//*[@id="wrapper"]/div[1]/div/div[3]/div/div[2]/div[2]/div/span[6]').text
print "nojobapplicants" + " " + jobapplicants
# Switch into the 'frameJobDescription' frame, take its page source and
# extract every text node under the element with id "divMainJobDescription".
# `html` is presumably lxml.html (its import is not shown) -- TODO confirm.
# Note that the local name `jobdesc` shadows the function's own name here.
driver.switch_to_frame('frameJobDescription')
jobdesc = driver.page_source
jobdesctree = html.fromstring(jobdesc)
jobdesctrees = jobdesctree.xpath('//*[@id="divMainJobDescription"]//text()')
print jobdesctrees
# Return to the top-level document before entering the next frame.
driver.switch_to_default_content()
driver.switch_to_frame('frameSkillsRequired')
jobskills = driver.page_source
jobskillstree = html.fromstring(jobskills)
# NOTE(review): this XPath starts with four slashes ('////*') where the
# description query above uses two -- likely a typo; confirm.
jobskillstrees = jobskillstree.xpath('////*[@id="divMainSkillsRequired"]//text()')
print jobskillstrees
# NOTE(review): no driver.quit()/close() appears in the lines shown, so each
# call seems to leave its Firefox instance open.
有时,当我只抓取单个URL时,Selenium能够检索到所有必需的字段。然而,当我把抓取放进一个遍历所有URL的循环中时,它很快就在第二个URL处停止工作,并返回以下错误:
> Traceback (most recent call last): File "E:\jobsbank15012016.py",
> line 123, in <module>
> main() File "E:\jobsbank15012016.py", line 120, in main
> jobdesc(url) File "E:\jobsbank15012016.py", line 59, in jobdesc
> postingdate = driver.find_element_by_xpath('//*[@id="wrapper"]/div[1]/div/div[3]/div/div[2]/div[1]/div[2]/div[1]/table/tbody/tr/td[1]/div/p[1]').text
> File
> "C:\Python27\lib\site-packages\selenium\webdriver\remote\webdriver.py",
> line 252, in find_element_by_xpath
> return self.find_element(by=By.XPATH, value=xpath) File "C:\Python27\lib\site-packages\selenium\webdriver\remote\webdriver.py",
> line 684, in find_element
> {'using': by, 'value': value})['value'] File "C:\Python27\lib\site-packages\selenium\webdriver\remote\webdriver.py",
> line 195, in execute
> self.error_handler.check_response(response) File "C:\Python27\lib\site-packages\selenium\webdriver\remote\errorhandler.py",
> line 170, in check_response
> raise exception_class(message, screen, stacktrace) NoSuchElementException: Message: Unable to locate element:
> {"method":"xpath","selector":"//*[@id=\"wrapper\"]/div[1]/div/div[3]/div/div[2]/div[1]/div[2]/div[1]/table/tbody/tr/td[1]/div/p[1]"}
> Stacktrace:
> at FirefoxDriver.prototype.findElementInternal_ (file:///c:/users/anon/appdata/local/temp/tmprizeob/extensions/fxdriver@googlecode.com/components/driver-component.js:10299)
> at FirefoxDriver.prototype.findElement (file:///c:/users/anon/appdata/local/temp/tmprizeob/extensions/fxdriver@googlecode.com/components/driver-component.js:10308)
> at DelayedCommand.prototype.executeInternal_/h (file:///c:/users/anon/appdata/local/temp/tmprizeob/extensions/fxdriver@googlecode.com/components/command-processor.js:12282)
> at DelayedCommand.prototype.executeInternal_ (file:///c:/users/anon/appdata/local/temp/tmprizeob/extensions/fxdriver@googlecode.com/components/command-processor.js:12287)
> at DelayedCommand.prototype.execute/< (file:///c:/users/anon/appdata/local/temp/tmprizeob/extensions/fxdriver@googlecode.com/components/command-processor.js:12229)
解决方案
第一个问题是,您应该添加显式等待(wait),为页面加载留出一些时间。您还应该简化定位器,使它们不那么依赖于HTML结构:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Firefox()
driver.get(link)

# Wait up to 10 seconds for the job title to be present in the DOM before
# reading it; an immediate lookup can raise NoSuchElementException when the
# element has not been rendered yet.
wait = WebDriverWait(driver, 10)
# until() returns the value produced by the expected condition -- for
# presence_of_element_located that is the located WebElement -- so a second
# find_element call is unnecessary.
job_name = wait.until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "h3.title"))
).text
在这里,对于职位名称,我们使用一个简单的CSS选择器——基本上就是查找一个带有 `title` 类的 `h3` 元素。与之等价的XPath写法是 `//h3[contains(@class, 'title')]`。
相关文章