In Python, a multithreaded crawler that also stores the scraped data can be put together in a few steps:
import queue
import sqlite3
import threading

import requests
from bs4 import BeautifulSoup
def create_connection():
    # check_same_thread=False lets the worker threads share this connection;
    # writes are serialized with a lock in save_data()
    conn = sqlite3.connect('data.db', check_same_thread=False)
    return conn

def create_table(conn):
    cursor = conn.cursor()
    cursor.execute('''CREATE TABLE IF NOT EXISTS web_data (
                          id INTEGER PRIMARY KEY AUTOINCREMENT,
                          url TEXT NOT NULL,
                          title TEXT NOT NULL,
                          content TEXT NOT NULL
                      )''')
    conn.commit()
def process_data(url, title, content):
    # Clean or transform the scraped data here as needed
    return {
        'url': url,
        'title': title,
        'content': content
    }
def save_data(conn, lock, data):
    # The SQLite connection is shared by all threads, so writes are serialized
    with lock:
        cursor = conn.cursor()
        cursor.execute('''INSERT INTO web_data (url, title, content)
                          VALUES (?, ?, ?)''',
                       (data['url'], data['title'], data['content']))
        conn.commit()
def crawl(url, conn, lock):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Use the page <title> if present, otherwise fall back to the URL
        title = soup.title.string.strip() if soup.title and soup.title.string else url
        content = soup.get_text()
        data = process_data(url, title, content)
        save_data(conn, lock, data)
    except Exception as e:
        print(f"Error while processing {url}: {e}")
def worker(url_queue, conn, lock):
    # Each thread keeps pulling URLs from the queue until it is empty
    while True:
        try:
            url = url_queue.get_nowait()
        except queue.Empty:
            break
        crawl(url, conn, lock)

def start_threads(urls, num_threads):
    conn = create_connection()
    create_table(conn)
    lock = threading.Lock()
    url_queue = queue.Queue()
    for url in urls:
        url_queue.put(url)
    threads = []
    for _ in range(min(num_threads, len(urls))):
        thread = threading.Thread(target=worker, args=(url_queue, conn, lock))
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()
    conn.close()
urls = [
    'https://example.com/page1',
    'https://example.com/page2',
    # ...
]
num_threads = 10
start_threads(urls, num_threads)
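As a side note, if you would rather not manage the threads and the queue yourself, the standard library's concurrent.futures.ThreadPoolExecutor gives the same worker-pool behaviour in fewer lines. This is only a rough sketch that reuses the crawl, create_connection and create_table functions above; start_pool is a hypothetical helper name:

from concurrent.futures import ThreadPoolExecutor

def start_pool(urls, num_threads):
    # Hypothetical alternative to start_threads() using a thread pool
    conn = create_connection()
    create_table(conn)
    lock = threading.Lock()
    # The with-block waits for every submitted URL to finish before exiting
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        executor.map(lambda u: crawl(u, conn, lock), urls)
    conn.close()

start_pool(urls, num_threads)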
The example above uses an SQLite database to store the data. You can swap in another database such as MySQL or PostgreSQL (see the sketch below), and adjust the data-processing and storage logic to suit your needs.
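For instance, switching to PostgreSQL mainly means changing the connection and the INSERT statement. Here is a minimal sketch using the psycopg2 driver; the connection parameters are placeholders, and the web_data table would need to be created with SERIAL PRIMARY KEY instead of INTEGER PRIMARY KEY AUTOINCREMENT:

import psycopg2

def create_pg_connection():
    # Placeholder credentials; replace with your own database settings
    return psycopg2.connect(host='localhost', dbname='crawler',
                            user='crawler', password='secret')

def save_data_pg(conn, lock, data):
    # PostgreSQL placeholders are %s rather than SQLite's ?
    with lock:
        with conn.cursor() as cursor:
            cursor.execute(
                'INSERT INTO web_data (url, title, content) VALUES (%s, %s, %s)',
                (data['url'], data['title'], data['content'])
            )
        conn.commit()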