# How to Collect Beijing Second-Hand Housing Data with Python
In today's era of big data, housing data is valuable to investors, homebuyers, and researchers alike. This article walks through how to collect Beijing second-hand housing data with a Python stack, covering the full workflow from environment setup to data storage.
## 1. Preparation
### 1.1 Technology Stack
We mainly use the following Python libraries:
- **requests**: send HTTP requests
- **BeautifulSoup**/lxml: parse HTML
- **pandas**: process and analyze the data
- **selenium**: handle dynamically loaded content
- **MongoDB**/MySQL: store the results
```bash
# Install the required libraries
pip install requests beautifulsoup4 pandas selenium pymongo mysql-connector-python
```
### 1.2 Target Site Analysis
Taking Lianjia (https://bj.lianjia.com/ershoufang/) as an example, we need to:

1. Analyze the URL structure
2. Check how the page is loaded (static vs. dynamic)
3. Check for anti-crawling measures (CAPTCHAs, request-rate limits, etc.)

Use the browser developer tools (F12) to inspect:

- the network requests
- the format of the returned data (HTML/JSON)
- the tags that contain the key data
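For example, a quick way to check whether the listings are already present in the raw HTML (i.e. the page is essentially static) is to fetch one page and look for the listing container. This is only a sketch; the `.sellListContent` selector is the one observed on the Lianjia listing page and may change at any time.

```python
import requests
from bs4 import BeautifulSoup

# Quick probe: is the listing container in the raw HTML,
# or does it only appear after JavaScript runs?
resp = requests.get(
    "https://bj.lianjia.com/ershoufang/pg1/",
    headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
    timeout=10,
)
print(resp.status_code, resp.headers.get("Content-Type"))

soup = BeautifulSoup(resp.text, "lxml")
listings = soup.select(".sellListContent li")
print(f"Listing items found in raw HTML: {len(listings)}")
```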
## 2. Collecting Listing Data
### 2.1 Single-Page Collection
```python
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_page(page_num):
    url = f"https://bj.lianjia.com/ershoufang/pg{page_num}/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')

    house_list = []
    for item in soup.select('.sellListContent li'):
        try:
            title = item.select_one('.title a').text
            price = item.select_one('.totalPrice').text
            unit_price = item.select_one('.unitPrice').text
            house_info = item.select('.houseInfo')[0].text.split('|')

            house_list.append({
                'title': title,
                'price': float(price.replace('萬', '')),
                'unit_price': float(unit_price.replace('元/平', '').replace(',', '')),
                'district': house_info[0].strip(),
                'area': float(house_info[1].replace('平米', '').strip()),
                'layout': house_info[2].strip()
            })
        except Exception as e:
            print(f"Parse error: {e}")
            continue

    return pd.DataFrame(house_list)

# Test single-page collection
df = get_page(1)
print(df.head())
```
### 2.2 Multi-Page Collection
```python
import time
import random

def get_multiple_pages(start_page, end_page):
    all_data = []
    for page in range(start_page, end_page + 1):
        print(f"Collecting page {page}...")
        try:
            df = get_page(page)
            all_data.append(df)
            time.sleep(random.uniform(1, 3))  # random delay between pages
        except Exception as e:
            print(f"Failed to collect page {page}: {e}")
    return pd.concat(all_data, ignore_index=True)

# Collect the first 10 pages
data = get_multiple_pages(1, 10)
```
### 2.3 Dynamically Rendered Pages
When a page is rendered by JavaScript, use selenium instead:
```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_dynamic_page(url):
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # headless mode
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(url)
    try:
        # Wait for the listing container to appear
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "sellListContent"))
        )
        soup = BeautifulSoup(driver.page_source, 'lxml')
        # Subsequent parsing logic...
        return soup
    finally:
        driver.quit()
```
### 2.4 Anti-Crawling Strategies
Common countermeasures:

1. Rotate the User-Agent header:

```python
import random

user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)...",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)..."
]
headers = {'User-Agent': random.choice(user_agents)}
```

2. Route requests through a proxy:

```python
proxies = {
    'http': 'http://127.0.0.1:1080',
    'https': 'https://127.0.0.1:1080'
}
requests.get(url, proxies=proxies)
```

3. Add random delays between requests:

```python
import time
import random

time.sleep(random.uniform(0.5, 2))
```
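Putting these pieces together, a request helper along the following lines can combine User-Agent rotation, optional proxies, random delays, and simple retries. The function name and retry policy are illustrative assumptions, not part of the original code:

```python
import random
import time
import requests

def fetch_with_retries(url, proxies=None, max_retries=3):
    """Illustrative helper: rotate User-Agent, pause randomly, retry on failure."""
    for attempt in range(1, max_retries + 1):
        headers = {'User-Agent': random.choice(user_agents)}  # user_agents list defined above
        try:
            time.sleep(random.uniform(0.5, 2))  # random delay before each request
            resp = requests.get(url, headers=headers, proxies=proxies, timeout=10)
            resp.raise_for_status()
            return resp
        except requests.RequestException as e:
            print(f"Attempt {attempt} failed: {e}")
    return None
```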
## 3. Data Storage
### 3.1 MongoDB
```python
from pymongo import MongoClient

def save_to_mongodb(data):
    client = MongoClient('mongodb://localhost:27017/')
    db = client['real_estate']
    collection = db['beijing_ershou']
    # Convert the DataFrame to a list of dicts
    records = data.to_dict('records')
    collection.insert_many(records)
```
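One caveat with `insert_many` is that re-running the crawler will store the same listings again. A possible refinement (not in the original article) is to create a unique index and upsert by title, assuming the title is stable enough to serve as a deduplication key:

```python
from pymongo import MongoClient

def save_to_mongodb_upsert(data):
    """Sketch: upsert records so repeated runs do not create duplicates."""
    client = MongoClient('mongodb://localhost:27017/')
    collection = client['real_estate']['beijing_ershou']
    # Unique index on title (assumes titles are a reasonable dedup key)
    collection.create_index('title', unique=True)
    for record in data.to_dict('records'):
        collection.update_one({'title': record['title']},
                              {'$set': record},
                              upsert=True)
```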
### 3.2 MySQL
```python
import mysql.connector

def save_to_mysql(data):
    conn = mysql.connector.connect(
        host="localhost",
        user="root",
        password="password",
        database="real_estate"
    )
    cursor = conn.cursor()

    create_table = """
    CREATE TABLE IF NOT EXISTS beijing_ershou (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255),
        price FLOAT,
        unit_price FLOAT,
        district VARCHAR(50),
        area FLOAT,
        layout VARCHAR(50),
        crawl_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    )
    """
    cursor.execute(create_table)

    insert_sql = """
    INSERT INTO beijing_ershou
    (title, price, unit_price, district, area, layout)
    VALUES (%s, %s, %s, %s, %s, %s)
    """
    for _, row in data.iterrows():
        cursor.execute(insert_sql, (
            row['title'], row['price'], row['unit_price'],
            row['district'], row['area'], row['layout']
        ))

    conn.commit()
    conn.close()
```
## 4. Data Cleaning and Analysis
### 4.1 Cleaning
```python
def clean_data(df):
    # Drop rows with missing values
    df = df.dropna()
    # Remove duplicate listings
    df = df.drop_duplicates(subset=['title'])
    # Ensure numeric types
    df['price'] = df['price'].astype(float)
    df['unit_price'] = df['unit_price'].astype(float)
    return df

cleaned_data = clean_data(data)
```
### 4.2 Simple Analysis
```python
# Average unit price by district
district_avg = cleaned_data.groupby('district')['unit_price'].mean().sort_values(ascending=False)

# Price distribution (total price in 10,000 yuan)
price_bins = [0, 200, 300, 400, 500, 600, 1000, float('inf')]
price_labels = ['0-200', '200-300', '300-400', '400-500', '500-600', '600-1000', '1000+']
cleaned_data['price_range'] = pd.cut(cleaned_data['price'], bins=price_bins, labels=price_labels)
```
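A quick way to inspect these results is to plot them, for example with matplotlib (a minimal sketch; chart styling is up to you):

```python
import matplotlib.pyplot as plt

# Bar chart of average unit price per district
district_avg.plot(kind='bar', figsize=(10, 5), title='Average unit price by district')
plt.ylabel('yuan / m2')
plt.tight_layout()
plt.show()

# Number of listings in each price range
cleaned_data['price_range'].value_counts().sort_index().plot(kind='bar', title='Listings per price range')
plt.tight_layout()
plt.show()
```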
## 5. Project Structure
For a longer-running crawler, the code can be organized into modules:
```
beijing-housing-spider/
├── config/              # configuration
│   ├── db_config.py     # database settings
│   └── user_agents.py   # User-Agent list
├── spiders/             # crawler core
│   ├── base_spider.py   # base spider class
│   ├── lianjia.py       # Lianjia spider
│   └── beike.py         # Beike spider
├── utils/               # helper utilities
│   ├── proxy.py         # proxy helpers
│   └── logger.py        # logging helpers
├── storage/             # storage modules
│   ├── mongodb.py
│   └── mysql.py
└── main.py              # entry point
```
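As a rough idea of what `spiders/base_spider.py` could contain, the class below centralizes session handling and request delays; the class name and methods are assumptions for illustration, not code from the original project layout:

```python
import random
import time
import requests

class BaseSpider:
    """Illustrative base class shared by site-specific spiders."""

    def __init__(self, user_agents, delay_range=(1, 3)):
        self.user_agents = user_agents
        self.delay_range = delay_range
        self.session = requests.Session()

    def fetch(self, url):
        # Rotate User-Agent and wait a random interval before each request
        headers = {'User-Agent': random.choice(self.user_agents)}
        time.sleep(random.uniform(*self.delay_range))
        resp = self.session.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
        return resp.text

    def parse(self, html):
        # Each site-specific spider (e.g. lianjia.py) implements its own parsing
        raise NotImplementedError
```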
## 6. Scheduled Collection
```python
# Example: schedule a daily collection job with APScheduler
from apscheduler.schedulers.blocking import BlockingScheduler

def daily_job():
    data = get_multiple_pages(1, 5)
    save_to_mongodb(data)

scheduler = BlockingScheduler()
scheduler.add_job(daily_job, 'cron', hour=2)  # run every day at 2 a.m.
scheduler.start()
```
## 7. Conclusion
With the approach described in this article you can build a complete pipeline for collecting Beijing second-hand housing data. In practice, keep improving the exception handling, logging, and related plumbing so the crawler stays robust and reliable.