Python + Selenium 動態爬蟲
使用 Python 試作簡單的動態爬蟲。Jupyter Notebook, Selenium, WebDriver.
安裝 Selenium
pip install selenium
前置安裝請參考
Python 第一支爬蟲(Jupyter Notebook)指令紀錄
開啟 Google 搜尋網頁
# 載入需要的套件
from selenium import webdriver
# Start a Chrome browser session driven by Selenium.
driver = webdriver.Chrome()
# Load the target page: the Google search engine.
target_url = "https://www.google.com/"
driver.get(target_url)
填入關鍵字並送出搜尋
from selenium.webdriver.common.by import By
# Locate the search input by its form-field name ("q") instead of an absolute
# XPath: an absolute path breaks whenever the page layout is reshuffled.
element = driver.find_element(By.NAME, "q")
# Type the search keywords into the input.
element.send_keys("Python 教學")
# Find the form that encloses the input (relative lookup) and submit it.
searchForm = element.find_element(By.XPATH, "./ancestor::form")
searchForm.submit()
解析搜尋結果/清單
from selenium.common.exceptions import NoSuchElementException

# Collect the result containers. The class names / attributes used below
# mirror Google's markup at the time of writing and may need updating.
infoList = driver.find_elements(By.CSS_SELECTOR, "div.MjjYud")
for info in infoList:
    try:
        # Title of the result entry.
        title = info.find_element(By.CSS_SELECTOR, "h3.LC20lb")
        print(title.get_attribute("innerHTML"))
        # Link the result entry points to.
        link = info.find_element(By.CSS_SELECTOR, "a[jsname=UWckNb]")
        print(link.get_attribute("href"))
    except NoSuchElementException:
        # Some containers lack a title or link; skip them, but let any
        # other error (e.g. a lost browser session) surface normally.
        continue
關閉瀏覽器視窗
# End the WebDriver session. quit() closes every browser window and shuts the
# driver down, whereas close() only closes the currently focused window.
driver.quit()
Google 搜尋引擎網頁在不同版本會有不同的 HTML layout,請依當時狀況變更 XPATH 等定位參數。
沒圖沒真相

參考資料
動態網頁爬蟲第一道鎖 — Selenium教學:如何使用Webdriver、send_keys(附Python 程式碼) ------ 指令已過時,開發流程仍有參考性。
AttributeError: 'WebDriver' object has no attribute 'find_element_by_xpath' ------ 新舊版指令遷移對照(以下每兩行一組:上為舊版寫法,下為新版寫法),如:
.find_element_by_class_name(
.find_element(By.CLASS_NAME,
.find_element_by_css_selector(
.find_element(By.CSS_SELECTOR,
.find_element_by_id(
.find_element(By.ID,
.find_element_by_link_text(
.find_element(By.LINK_TEXT,
.find_element_by_name(
.find_element(By.NAME,
.find_element_by_partial_link_text(
.find_element(By.PARTIAL_LINK_TEXT,
.find_element_by_tag_name(
.find_element(By.TAG_NAME,
.find_element_by_xpath(
.find_element(By.XPATH,
.find_elements_by_class_name(
.find_elements(By.CLASS_NAME,
.find_elements_by_css_selector(
.find_elements(By.CSS_SELECTOR,
.find_elements_by_id(
.find_elements(By.ID,
.find_elements_by_link_text(
.find_elements(By.LINK_TEXT,
.find_elements_by_name(
.find_elements(By.NAME,
.find_elements_by_partial_link_text(
.find_elements(By.PARTIAL_LINK_TEXT,
.find_elements_by_tag_name(
.find_elements(By.TAG_NAME,
.find_elements_by_xpath(
.find_elements(By.XPATH,
(EOF)
Last updated