# How to Scrape Meituan Travel Attraction Review Data with Python
In big-data analysis for the travel industry, user review data carries significant commercial value. This article walks through how to scrape review data for Meituan travel attractions with Python, covering the technology stack, anti-crawling countermeasures, and a complete code implementation.
## 1. Crawler Technology Selection
### 1.1 Core Tools
- **Requests**: handles HTTP requests
- **BeautifulSoup** / lxml: HTML parsing
- **Selenium**: handles dynamically rendered pages
- **PyMySQL**: data storage
### 1.2 Installing the Dependencies
```bash
pip install requests beautifulsoup4 selenium pymysql
```

## 2. Page Analysis

A typical Meituan attraction review page has a URL of the form:

```
https://www.meituan.com/meishi/123456/review/all/
```

where `123456` is the attraction (POI) ID.
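If you start from a review-page URL rather than an ID, the POI ID can be pulled out with a simple regular expression (a minimal sketch based on the URL pattern shown above):

```python
import re

def extract_poi_id(url):
    """Return the numeric POI ID from a Meituan review URL, or None if no match."""
    match = re.search(r"/meishi/(\d+)/", url)
    return match.group(1) if match else None

print(extract_poi_id("https://www.meituan.com/meishi/123456/review/all/"))  # -> 123456
```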
## 3. Fetching the Review Pages

A plain Requests call with browser-like headers handles the simple case:

```python
import requests

def get_page(poi_id, page):
    """Fetch one page of reviews for the given attraction ID."""
    url = f"https://www.meituan.com/meishi/{poi_id}/review/all/"
    params = {
        "pageno": page,
        "sortType": "default"
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Referer": "https://www.meituan.com/"
    }
    try:
        response = requests.get(url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None
```
## 4. Parsing the Reviews

Parse the returned HTML with BeautifulSoup (the CSS selectors must track Meituan's page structure):

```python
from bs4 import BeautifulSoup

def parse_comments(html):
    """Extract review records from one review-list page."""
    soup = BeautifulSoup(html, 'lxml')
    comments = []
    for item in soup.select('.reviews-list .review-item'):
        try:
            comment = {
                'username': item.select_one('.name').text.strip(),
                'score': int(item.select_one('.score').text),
                'content': item.select_one('.desc').text.strip(),
                'visit_time': item.select_one('.time').text.split(':')[-1],
                'like_count': int(item.select_one('.like').text or 0)
            }
            comments.append(comment)
        except Exception as e:
            print(f"Parse error: {e}")
    return comments
```
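A quick end-to-end check of the two functions above (the POI ID `123456` is a placeholder):

```python
html = get_page("123456", 1)   # placeholder attraction ID
if html:
    comments = parse_comments(html)
    print(f"Parsed {len(comments)} comments")
    if comments:
        print(comments[0])     # inspect one record to verify the selectors
```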
## 5. Handling Dynamically Rendered Pages with Selenium

When a plain request cannot retrieve the data (for example, because the review list is rendered by JavaScript), fall back to Selenium:

```python
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def get_dynamic_page(poi_id, page):
    """Render the review page in headless Chrome and return its HTML."""
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    driver = webdriver.Chrome(options=chrome_options)
    url = f"https://www.meituan.com/meishi/{poi_id}/review/all/?pageno={page}"
    try:
        driver.get(url)
        time.sleep(5)  # give the page's JavaScript time to render the review list
        return driver.page_source
    finally:
        driver.quit()
```
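A fixed sleep either wastes time or fires too early. A more robust variant, sketched below under the assumption that the rendered reviews match the `.review-item` selector used in the parser, waits explicitly for the elements to appear:

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_dynamic_page_waited(poi_id, page, timeout=10):
    """Like get_dynamic_page, but waits until at least one review item is present."""
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    driver = webdriver.Chrome(options=chrome_options)
    url = f"https://www.meituan.com/meishi/{poi_id}/review/all/?pageno={page}"
    try:
        driver.get(url)
        # '.review-item' is an assumed selector; adjust it to the live page structure
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.review-item'))
        )
        return driver.page_source
    except Exception as e:
        print(f"Dynamic fetch failed: {e}")
        return None
    finally:
        driver.quit()
```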
## 6. Anti-Crawling Measures

The simplest countermeasure is to randomize the interval between requests so the traffic looks less machine-generated:

```python
import time
import random

import requests

def safe_request(url, **kwargs):
    """Sleep 1-3 seconds before each request; extra kwargs (headers, params) pass through."""
    time.sleep(random.uniform(1, 3))
    return requests.get(url, **kwargs)
```
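Transient failures (timeouts, blocks after a burst of requests) are also common. A small retry helper with exponential backoff makes the crawler more resilient; this is a sketch, and the attempt count and delays are illustrative rather than tuned values:

```python
import time
import random

import requests

def request_with_retry(url, max_retries=3, **kwargs):
    """Retry a GET request with exponential backoff plus random jitter."""
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=10, **kwargs)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            wait = 2 ** attempt + random.uniform(0, 1)  # ~1s, ~2s, ~4s ...
            print(f"Attempt {attempt + 1} failed ({e}), retrying in {wait:.1f}s")
            time.sleep(wait)
    return None
```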
## 7. Storing the Data

Create a MySQL table for the reviews:

```sql
CREATE TABLE `meituan_comments` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `poi_id` varchar(20) NOT NULL,
  `username` varchar(50) DEFAULT NULL,
  `score` tinyint(1) DEFAULT NULL,
  `content` text,
  `visit_time` varchar(50) DEFAULT NULL,
  `like_count` int(11) DEFAULT 0,
  `create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
  PRIMARY KEY (`id`),
  KEY `idx_poi` (`poi_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
```
Then write the parsed records with PyMySQL:

```python
import pymysql

def save_to_db(comments, poi_id):
    """Insert parsed review records into the meituan_comments table."""
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='yourpassword',
        db='spider_data',
        charset='utf8mb4'
    )
    try:
        with conn.cursor() as cursor:
            sql = """INSERT INTO meituan_comments
                     (poi_id, username, score, content, visit_time, like_count)
                     VALUES (%s, %s, %s, %s, %s, %s)"""
            for comment in comments:
                cursor.execute(sql, (
                    poi_id,
                    comment['username'],
                    comment['score'],
                    comment['content'],
                    comment['visit_time'],
                    comment['like_count']
                ))
        conn.commit()
    finally:
        conn.close()
```
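For larger batches, calling `execute` once per row is slower than a single `executemany`; a minimal variant using the same connection settings and table as above:

```python
import pymysql

def save_to_db_batch(comments, poi_id):
    """Batch-insert review records with executemany (same schema as save_to_db)."""
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='yourpassword',
        db='spider_data',
        charset='utf8mb4'
    )
    sql = """INSERT INTO meituan_comments
             (poi_id, username, score, content, visit_time, like_count)
             VALUES (%s, %s, %s, %s, %s, %s)"""
    rows = [
        (poi_id, c['username'], c['score'], c['content'], c['visit_time'], c['like_count'])
        for c in comments
    ]
    try:
        with conn.cursor() as cursor:
            cursor.executemany(sql, rows)
        conn.commit()
    finally:
        conn.close()
```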
## 8. Putting It Together

```python
def main():
    poi_id = "123456"   # replace with a real attraction ID
    max_page = 10       # maximum number of pages to crawl

    for page in range(1, max_page + 1):
        print(f"Crawling page {page}...")
        # Try a plain request first
        html = get_page(poi_id, page)
        if not html or "驗證碼" in html:   # fall back to Selenium if a captcha page comes back
            html = get_dynamic_page(poi_id, page)
        if html:
            comments = parse_comments(html)
            save_to_db(comments, poi_id)
            print(f"Saved {len(comments)} comments")
        else:
            print(f"Failed to crawl page {page}")
        time.sleep(random.uniform(2, 5))

if __name__ == "__main__":
    main()
```
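Once a few pages are stored, a quick aggregation gives a first look at the data. This sketch reuses the connection settings from `save_to_db`; the database and table names come from the schema above:

```python
import pymysql

def print_score_summary(poi_id):
    """Print the review count and average score for one attraction."""
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='yourpassword',
        db='spider_data',
        charset='utf8mb4'
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute(
                "SELECT COUNT(*), AVG(score) FROM meituan_comments WHERE poi_id = %s",
                (poi_id,)
            )
            count, avg_score = cursor.fetchone()
            print(f"{count} reviews, average score {avg_score}")
    finally:
        conn.close()
```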
## 9. Summary

With the approach above, you can effectively collect Meituan travel attraction review data. Keep the daily crawl volume under roughly 1,000 records to avoid putting load on the target site. The collected data can be used in business-intelligence scenarios such as attraction service-quality analysis and user-preference research.
Note: In practice you will need to adjust the CSS selectors as Meituan's page structure changes, and make sure you comply with applicable laws and regulations. Check the site's terms of service before crawling.