# How to Scrape Daily Weather with Python and Selenium
## Introduction
In today's data-driven world, accurate weather information matters for daily life, agricultural planning, transportation, and more. Checking forecasts by hand is slow and inefficient, but with Python and the Selenium automation framework we can collect daily weather data programmatically. This article walks through building a stable weather scraper with Selenium, step by step.
## 1. Environment Setup
### 1.1 Installing Required Libraries
```bash
pip install selenium beautifulsoup4 pandas
```
### 1.2 Installing a Browser Driver
Download the driver that matches your browser version:

- Chrome: ChromeDriver
- Firefox: GeckoDriver

Place the driver executable in a directory on your system PATH, or in the project directory.
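If you would rather not manage driver binaries by hand, the third-party webdriver-manager package (an optional addition, not required by the rest of this article) can download a matching driver automatically. A minimal sketch:

```python
# Optional: automatic driver management with the third-party
# webdriver-manager package (pip install webdriver-manager).
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# ChromeDriverManager().install() downloads a ChromeDriver matching
# the installed Chrome and returns its local path.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.quit()
```

Recent Selenium releases (4.6+) also ship a built-in Selenium Manager that resolves drivers automatically, so on a current install you may not need either manual setup.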
## 2. Analyzing the Target Page
Using the China Weather site (www.weather.com.cn) as an example:

```python
# Example element locator. Note that Selenium locates elements, not
# text nodes, so the original "/text()" suffix would raise an
# InvalidSelectorException; read the value via the element's .text instead.
temperature_xpath = '//div[@class="tem"]/span'
```
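A minimal usage sketch for this locator (it assumes the driver from the next section has already navigated to a city's forecast page):

```python
from selenium.webdriver.common.by import By

# Locate the element and read its visible text, e.g. a temperature value
temperature = driver.find_element(By.XPATH, temperature_xpath).text
print(temperature)
```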
## 3. Initializing the WebDriver

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def init_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")     # headless mode: no visible browser window
    chrome_options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(options=chrome_options)
    return driver
```
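A quick smoke test of the helper (the URL here is just an example):

```python
driver = init_driver()
driver.get("http://www.weather.com.cn")
print(driver.title)  # should print the page title of the site
driver.quit()
```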
## 4. Fetching the Forecast Page

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def get_weather_data(driver, city):
    driver.get(f"http://www.weather.com.cn/weather/{city}.shtml")
    try:
        # Wait up to 10 seconds for the 7-day forecast container to appear
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "7d"))
        )
        # Return the full HTML so it can be parsed with BeautifulSoup below
        return driver.page_source
    except TimeoutException:
        print("Page load timed out")
        return None
```
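Network hiccups are common, so a small retry wrapper can make fetching more robust. This is a sketch; the helper name, retry count, and delay are arbitrary choices, not part of the original article:

```python
import time

def fetch_with_retry(driver, city, retries=3, delay=5):
    """Call get_weather_data up to `retries` times before giving up."""
    for attempt in range(1, retries + 1):
        html = get_weather_data(driver, city)
        if html is not None:
            return html
        print(f"Attempt {attempt} failed, retrying in {delay}s...")
        time.sleep(delay)
    return None
```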
## 5. Parsing the HTML with BeautifulSoup

```python
from bs4 import BeautifulSoup

def parse_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    weather_list = []
    # Each <li> under the forecast list holds one day's data
    for item in soup.select('.t li'):
        date = item.select_one('.date').get_text()
        weather = item.select_one('.wea').get_text()
        temp = item.select_one('.tem').get_text().replace('\n', '')
        weather_list.append([date, weather, temp])
    return weather_list
```
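Since pandas was installed earlier, the parsed rows can also be loaded into a DataFrame for analysis. A minimal sketch (it assumes `html` came from `get_weather_data`):

```python
import pandas as pd

rows = parse_html(html)
df = pd.DataFrame(rows, columns=['date', 'weather', 'temperature'])
print(df.head())
```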
## 6. Storing the Data
### 6.1 Saving to CSV

```python
import csv

def save_to_csv(data, filename):
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Date', 'Weather', 'Temperature'])
        writer.writerows(data)
```
### 6.2 Saving to MySQL

```python
import pymysql

def save_to_mysql(data):
    conn = pymysql.connect(host='localhost',
                           user='root',
                           password='password',
                           database='weather')
    cursor = conn.cursor()
    sql = """INSERT INTO daily_weather
             (record_date, weather, temperature)
             VALUES (%s, %s, %s)"""
    cursor.executemany(sql, data)
    conn.commit()
    cursor.close()
    conn.close()
```
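The code above assumes a `daily_weather` table already exists. One possible schema, run once before the first insert (the column types are assumptions, not from the original article):

```python
import pymysql

# Hypothetical schema matching the INSERT statement above; adjust as needed.
create_sql = """
CREATE TABLE IF NOT EXISTS daily_weather (
    id INT AUTO_INCREMENT PRIMARY KEY,
    record_date VARCHAR(32),
    weather VARCHAR(64),
    temperature VARCHAR(32)
)
"""

conn = pymysql.connect(host='localhost', user='root',
                       password='password', database='weather')
with conn.cursor() as cursor:
    cursor.execute(create_sql)
conn.commit()
conn.close()
```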
## 7. Basic Anti-Blocking Measures
Two simple precautions: pause for a random interval between requests, and set a realistic user agent.

```python
import random
import time

# Random pause between requests to avoid hammering the server
time.sleep(random.uniform(1, 3))

# Set a realistic user agent on the Chrome options from init_driver()
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
```
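To vary the browser fingerprint further, the user agent can be picked at random from a small pool. The strings below are illustrative placeholders, not tested values:

```python
import random

# Illustrative user-agent pool; swap in real, current UA strings
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "Mozilla/5.0 (X11; Linux x86_64)",
]

chrome_options.add_argument(f"user-agent={random.choice(USER_AGENTS)}")
```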
## 8. Scheduling a Daily Crawl
Use APScheduler to run the crawl on a fixed schedule:

```python
from apscheduler.schedulers.blocking import BlockingScheduler

scheduler = BlockingScheduler()

@scheduler.scheduled_job('cron', hour=7)
def daily_job():
    # Runs every day at 07:00
    driver = init_driver()
    html = get_weather_data(driver, '101010100')  # city code for Beijing
    if html:
        processed = parse_html(html)
        save_to_csv(processed, 'weather.csv')
    driver.quit()

scheduler.start()
```
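`BlockingScheduler` occupies the main thread. If the job should run alongside other work, APScheduler's `BackgroundScheduler` is a drop-in alternative; a sketch:

```python
from apscheduler.schedulers.background import BackgroundScheduler
import time

scheduler = BackgroundScheduler()
scheduler.add_job(daily_job, 'cron', hour=7)
scheduler.start()

# Keep the main thread alive so the background scheduler can fire
while True:
    time.sleep(60)
```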
## 9. Complete Example

```python
import csv
import random
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup


class WeatherSpider:
    def __init__(self):
        self.driver = self.init_driver()

    def init_driver(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("user-agent=Mozilla/5.0")
        driver = webdriver.Chrome(options=options)
        return driver

    def fetch_data(self, city_code):
        url = f"http://www.weather.com.cn/weather/{city_code}.shtml"
        self.driver.get(url)
        try:
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, "7d"))
            )
            time.sleep(random.uniform(1, 2))
            return self.driver.page_source
        except Exception as e:
            print(f"Error occurred: {e}")
            return None

    def parse_data(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        results = []
        for day in soup.select('.t li'):
            try:
                date = day.select_one('.date').get_text().strip()
                weather = day.select_one('.wea').get_text().strip()
                temp = day.select_one('.tem').get_text().replace('\n', '').strip()
                results.append([date, weather, temp])
            except AttributeError:
                continue  # skip entries missing any field
        return results

    def save_data(self, data, filename='weather.csv'):
        with open(filename, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            if f.tell() == 0:  # write the header only for a new, empty file
                writer.writerow(['Date', 'Weather', 'Temperature'])
            writer.writerows(data)

    def run(self, city_code):
        html = self.fetch_data(city_code)
        if html:
            data = self.parse_data(html)
            self.save_data(data)
            print(f"Fetched {len(data)} weather records")
        else:
            print("Failed to fetch data")

    def close(self):
        self.driver.quit()


if __name__ == "__main__":
    spider = WeatherSpider()
    try:
        spider.run('101010100')  # city code for Beijing
    finally:
        spider.close()
```
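To cover several cities in one run, loop over the city codes from the appendix. A usage sketch, meant to run in the same module as the class above (so `time` and `random` are already imported):

```python
# City codes from the appendix; the names are for logging only
CITY_CODES = {
    'Beijing': '101010100',
    'Shanghai': '101020100',
    'Guangzhou': '101280101',
    'Shenzhen': '101280601',
}

spider = WeatherSpider()
try:
    for name, code in CITY_CODES.items():
        print(f"Crawling {name}...")
        spider.run(code)
        time.sleep(random.uniform(2, 5))  # be polite between cities
finally:
    spider.close()
```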
## 10. FAQ
**Q: How do I scrape content that is loaded dynamically by JavaScript?**
A: Use explicit waits (Selenium's WebDriverWait mechanism), or analyze the underlying AJAX endpoints and request them directly.

**Q: What can I do when the site starts blocking requests or showing CAPTCHAs?**
A: Consider: 1. lowering the request frequency; 2. using a third-party CAPTCHA-solving service; 3. switching to another data source.

**Q: How often should the crawler run?**
A: Suggestions: 1. check how often the site updates its data; 2. set a reasonable crawl interval; 3. use an API provided by the site, if one exists.
## Conclusion
This article covered the complete workflow for scraping daily weather data with Python and Selenium. For real-world use, a few recommendations:

1. Respect the site's robots.txt policy.
2. Throttle your crawl rate so you don't put pressure on the server.
3. Maintain the code regularly so it keeps up with site redesigns.

Hopefully this helps you build a stable, reliable weather data collection system as a solid foundation for further analysis and applications.
## Appendix: City Codes for Major Chinese Cities

| City | Code |
|---|---|
| Beijing | 101010100 |
| Shanghai | 101020100 |
| Guangzhou | 101280101 |
| Shenzhen | 101280601 |
Note: the code samples in this article must be adapted to the actual structure of the target site; adjust the element locators before running, and comply with the site's terms of use.