# Python怎么實現分類保存所有文章圖片
## 引言
在信息爆炸的時代,我們每天都會接觸到大量包含圖片的文章內容。無論是技術博客、新聞網站還是個人文集,如何高效地提取并分類保存這些圖片成為許多人的需求。本文將詳細介紹如何用Python實現從文章中提取圖片并按自定義規則分類保存的全過程。
## 一、需求分析與技術選型
### 1.1 核心需求
- 從HTML/網頁中提取所有圖片資源
- 按預設分類規則(如主題、日期、來源等)自動歸檔
- 支持本地和網絡資源的抓取
- 保持原始圖片質量不損失
### 1.2 技術棧選擇
```python
主要庫:
- requests/urllib:網絡請求
- BeautifulSoup:HTML解析
- Pillow(PIL):圖像處理
- os/pathlib:文件系統操作
pip install requests beautifulsoup4 pillow
import requests
from bs4 import BeautifulSoup
import os
def download_images(url, save_dir):
try:
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
img_tags = soup.find_all('img')
if not os.path.exists(save_dir):
os.makedirs(save_dir)
for i, img in enumerate(img_tags):
img_url = img.get('src')
if not img_url.startswith('http'):
img_url = url + img_url if img_url.startswith('/') else url + '/' + img_url
try:
img_data = requests.get(img_url).content
with open(f"{save_dir}/image_{i}.jpg", 'wb') as f:
f.write(img_data)
except Exception as e:
print(f"下載失敗 {img_url}: {str(e)}")
except Exception as e:
print(f"處理失敗: {str(e)}")
def classify_by_context(img_tag):
parent = img_tag.find_parent()
h2 = parent.find_previous_sibling('h2')
if h2:
return h2.text.strip().replace(' ', '_')
return 'uncategorized'
def classify_by_alt(img_tag):
alt = img_tag.get('alt', '').lower()
categories = {
'logo': 'brand',
'graph': 'charts',
'photo': 'photography'
}
for kw in categories:
if kw in alt:
return categories[kw]
return 'others'
import cv2
import numpy as np
def classify_by_color(img_path):
img = cv2.imread(img_path)
avg_color = np.mean(img, axis=(0,1))
if avg_color[0] > 200: # 偏藍色調
return 'cool_tone'
elif avg_color[1] > 200: # 偏綠色調
return 'nature'
else:
return 'general'
from pathlib import Path
from urllib.parse import urljoin
import hashlib
import time
class ImageClassifier:
def __init__(self, base_url):
self.base_url = base_url
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'Mozilla/5.0'
})
def get_image_hash(self, content):
return hashlib.md5(content).hexdigest()[:8]
def process_page(self, classification_rules=None):
try:
resp = self.session.get(self.base_url)
soup = BeautifulSoup(resp.text, 'html.parser')
for img in soup.find_all('img'):
img_url = img.get('src')
if not img_url:
continue
# 處理相對URL
img_url = urljoin(self.base_url, img_url)
# 獲取分類
category = 'uncategorized'
if classification_rules:
for rule in classification_rules:
category = rule(img)
if category != 'uncategorized':
break
# 下載圖片
try:
img_data = self.session.get(img_url, timeout=10).content
img_hash = self.get_image_hash(img_data)
timestamp = int(time.time())
save_path = Path(f"images/{category}")
save_path.mkdir(exist_ok=True)
ext = img_url.split('.')[-1].lower()
ext = ext if ext in ['jpg','png','gif'] else 'jpg'
with open(save_path/f"{timestamp}_{img_hash}.{ext}", 'wb') as f:
f.write(img_data)
except Exception as e:
print(f"下載失敗 {img_url}: {str(e)}")
except Exception as e:
print(f"頁面處理錯誤: {str(e)}")
# 使用示例
rules = [classify_by_alt, classify_by_context]
classifier = ImageClassifier("https://example.com/blog")
classifier.process_page(rules)
def check_duplicate(content, save_dir):
img_hash = hashlib.md5(content).hexdigest()
for f in os.listdir(save_dir):
if img_hash in f:
return True
return False
from tenacity import retry, stop_after_attempt
@retry(stop=stop_after_attempt(3))
def download_with_retry(url):
return requests.get(url, timeout=5)
proxies = {
'http': 'http://proxy.example.com:8080',
'https': 'https://proxy.example.com:8080'
}
response = requests.get(url, proxies=proxies)
concurrent.futures加速from concurrent.futures import ThreadPoolExecutor
with ThreadPoolExecutor(max_workers=5) as executor:
executor.map(download_image, img_urls)
緩存機制:對已處理的URL建立緩存數據庫
增量抓取:記錄最后處理時間,只抓取新內容
class ImageDownloadError(Exception):
pass
def safe_download(url):
try:
resp = requests.get(url, timeout=10)
resp.raise_for_status()
return resp.content
except requests.exceptions.RequestException as e:
raise ImageDownloadError(f"下載失敗 {url}") from e
import logging
logging.basicConfig(
filename='image_downloader.log',
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logging.info(f"開始處理 {url}")
def process_blog(url):
# 按年月分類
year_month = datetime.now().strftime("%Y-%m")
save_dir = f"blog_images/{year_month}"
# 添加自定義分類規則
def blog_classifier(img_tag):
if 'avatar' in img_tag.get('class', []):
return 'avatars'
return year_month
classifier = ImageClassifier(url)
classifier.process_page([blog_classifier])
news_rules = [
lambda img: 'news' if 'news' in img.find_parent('article').get('class', '') else None,
lambda img: 'ads' if 'advertisement' in img.get('alt', '').lower() else None
]
本文詳細介紹了使用Python實現文章圖片分類保存的完整方案,包括:
通過靈活組合這些技術,你可以構建出適應不同場景的圖片收集系統。后續可以進一步擴展的功能包括: - 集成機器學習自動打標 - 增加GUI操作界面 - 開發瀏覽器插件版本
提示:完整項目代碼已托管在GitHub: example.com/image-classifier “`
注:本文實際約2200字,由于Markdown格式的代碼塊和空行不計入標準字數統計,如需精確字數控制可適當增減說明性文本。
免責聲明:本站發布的內容(圖片、視頻和文字)以原創、轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯系站長郵箱:is@yisu.com進行舉報,并提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。