# 如何用Python爬取酷我音樂
## 前言
在當今數字音樂時代,音樂平臺如酷我音樂擁有海量的正版音樂資源。作為Python開發者,我們可能希望獲取這些音樂數據用于個人學習、數據分析或開發第三方應用。本文將詳細介紹如何使用Python爬蟲技術爬取酷我音樂的數據,包括歌曲信息、歌詞以及音頻文件等。
**請注意**:本文僅用于技術交流和學習,請遵守相關法律法規和酷我音樂的用戶協議,不得將爬取的數據用于商業用途或侵犯版權。
## 一、環境準備
在開始之前,我們需要準備以下Python環境和庫:
```python
# 基礎庫
import requests # 用于發送HTTP請求
from bs4 import BeautifulSoup # 用于解析HTML
import json # 處理JSON數據
import re # 正則表達式
import os # 文件操作
import time # 時間控制
import random # 隨機數生成
# 可選高級庫
import selenium # 用于處理動態加載內容
from fake_useragent import UserAgent # 生成隨機User-Agent
```

安裝這些庫可以使用pip命令:
pip install requests beautifulsoup4 selenium fake-useragent
首先我們需要了解酷我音樂的網頁結構:
通過分析我們發現:
- 搜索API:https://www.kuwo.cn/api/www/search/searchMusicBykeyWord?key=關鍵詞
- 需要處理反爬機制(Cookie、Referer、csrf等)
移動端API通常限制較少,我們可以嘗試:
- 使用抓包工具(如Charles或Fiddler)分析手機APP請求
- 發現核心API:http://m.kuwo.cn/newh5/singles/songinfoandlrc?musicId=歌曲ID
def search_song(keyword):
    """Search Kuwo for *keyword* and return the result list, or None on failure.

    Requires a valid Cookie / CSRF token captured from a logged-in browser
    session; without them the search API rejects the request.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Cookie': '你的酷我Cookie',
        'Referer': 'https://www.kuwo.cn/',
        'csrf': '你的CSRF Token',
    }
    url = 'https://www.kuwo.cn/api/www/search/searchMusicBykeyWord'
    try:
        # `params` lets requests percent-encode the keyword (CJK, spaces, '&', ...)
        # instead of interpolating it raw into the query string.
        response = requests.get(url, params={'key': keyword}, headers=headers)
    except requests.RequestException as e:
        # Consistent with get_song_detail: network errors return None, not raise.
        print(f"發生錯誤:{str(e)}")
        return None
    if response.status_code == 200:
        data = response.json()
        return data['data']['list']
    print(f"搜索失敗,狀態碼:{response.status_code}")
    return None
def get_song_detail(music_id):
    """Fetch the musicInfo JSON for *music_id*; None on HTTP or network failure."""
    request_headers = {
        'User-Agent': UserAgent().random,  # rotate the UA on every call
        'Referer': f'https://www.kuwo.cn/play_detail/{music_id}',
    }
    api = f'http://www.kuwo.cn/api/www/music/musicInfo?mid={music_id}'
    try:
        resp = requests.get(api, headers=request_headers)
        if resp.status_code == 200:
            return resp.json()
        print(f"獲取歌曲詳情失敗,狀態碼:{resp.status_code}")
        return None
    except Exception as e:
        print(f"發生錯誤:{str(e)}")
        return None
酷我音樂有較強的反爬措施,我們需要處理:
def get_random_headers():
    """Build browser-like request headers with a freshly randomised User-Agent."""
    headers = {'User-Agent': UserAgent().random}
    headers.update({
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Connection': 'keep-alive',
        'Referer': 'https://www.kuwo.cn/',
    })
    return headers
# Pool of HTTP proxy endpoints to rotate through when the site rate-limits
# a single IP.
# NOTE(review): these are placeholder addresses (123.456.789.012 is not even
# a valid IPv4) — replace with live proxies before use.
PROXY_POOL = [
'http://123.456.789.012:8888',
'http://112.113.114.115:9999',
# ... add more proxy IPs here
]
def get_with_proxy(url):
    """GET *url* through a random proxy from PROXY_POOL.

    Returns the Response, or None when the request fails or times out.
    """
    proxy = {'http': random.choice(PROXY_POOL)}
    try:
        response = requests.get(url, proxies=proxy, timeout=10)
        return response
    except requests.RequestException:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate instead of being silently swallowed.
        return None
def safe_request(url, max_retry=3):
    """GET *url* with random headers, jittered delay and up to *max_retry* tries.

    Returns the Response on HTTP 200, or None after every attempt fails.
    """
    for i in range(max_retry):
        try:
            time.sleep(random.uniform(0.5, 2.0))  # random delay to look less bot-like
            response = requests.get(url, headers=get_random_headers())
            if response.status_code == 200:
                return response
            # A non-200 status counts as a failed attempt and is retried.
            print(f"請求失敗,重試 {i+1}/{max_retry}: HTTP {response.status_code}")
        except requests.RequestException as e:
            # Narrowed from `except Exception` so programming errors surface.
            print(f"請求失敗,重試 {i+1}/{max_retry}: {str(e)}")
    return None
def get_audio_url(music_id):
    """Resolve the playable MP3 URL for *music_id*, or None when unavailable."""
    api = f'http://www.kuwo.cn/url?format=mp3&rid={music_id}&type=convert_url3'
    resp = safe_request(api)
    if resp is None:
        return None
    payload = resp.json()
    return payload.get('url')
def download_music(music_id, save_path='./musics'):
    """Download the MP3 for *music_id* into *save_path*.

    Returns True on success, False when the URL cannot be resolved or the
    transfer fails.
    """
    os.makedirs(save_path, exist_ok=True)
    audio_url = get_audio_url(music_id)
    if not audio_url:
        print("無法獲取音頻URL")
        return False
    file_path = os.path.join(save_path, f'{music_id}.mp3')
    try:
        # `with` closes the streamed connection deterministically (the
        # original leaked it); chunks are written as they arrive.
        with requests.get(audio_url, stream=True) as response:
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
        print(f"下載成功:{file_path}")
        return True
    except Exception as e:
        # Error message restored — the original literal was mojibake
        # ("下載失?。簕...") from a bad encoding round-trip.
        print(f"下載失敗:{str(e)}")
        return False
def get_lyrics(music_id):
    """Fetch the timestamped lyric list for *music_id* from the mobile API."""
    api = f'http://m.kuwo.cn/newh5/singles/songinfoandlrc?musicId={music_id}'
    resp = safe_request(api)
    if resp is not None:
        payload = resp.json()
        if payload.get('status') == 200:
            return payload.get('data', {}).get('lrclist', [])
    return None
def save_lyrics(music_id, lyrics, save_path='./lyrics'):
    """Write *lyrics* (dicts with 'time' and 'lineLyric') to an .lrc file."""
    os.makedirs(save_path, exist_ok=True)
    file_path = os.path.join(save_path, f'{music_id}.lrc')
    lrc_lines = [f"[{item['time']}]{item['lineLyric']}\n" for item in lyrics]
    with open(file_path, 'w', encoding='utf-8') as f:
        f.writelines(lrc_lines)
    print(f"歌詞保存成功:{file_path}")
class KuWoMusicSpider:
    """Session-based Kuwo client: search, fetch metadata, download audio and lyrics."""

    def __init__(self):
        # One Session reuses the TCP connection and keeps the randomised headers.
        self.session = requests.Session()
        self.session.headers.update(get_random_headers())

    def search(self, keyword, page=1, size=30):
        """Search for songs matching *keyword* (paged); JSON dict or None."""
        endpoint = 'https://www.kuwo.cn/api/www/search/searchMusicBykeyWord'
        query = {'key': keyword, 'pn': page, 'rn': size}
        response = self.session.get(endpoint, params=query)
        if not response.ok:
            return None
        return response.json()

    def get_music_info(self, music_id):
        """Return the detailed metadata JSON for *music_id*, or None."""
        endpoint = f'http://www.kuwo.cn/api/www/music/musicInfo?mid={music_id}'
        response = self.session.get(endpoint)
        if not response.ok:
            return None
        return response.json()

    def download(self, music_id, save_dir='downloads'):
        """Download audio (and lyrics when available) for *music_id*.

        Files land under ``save_dir/<artist>/<album>/``. Returns True on
        success, False when metadata or the audio URL cannot be resolved.
        """
        info = self.get_music_info(music_id)
        if not info:
            return False
        # Artist/album become directory names; '/' would split the path.
        artist = info['data']['artist'].replace('/', '_')
        album = info['data']['album'].replace('/', '_')
        target_dir = os.path.join(save_dir, artist, album)
        os.makedirs(target_dir, exist_ok=True)
        audio_url = get_audio_url(music_id)
        if not audio_url:
            return False
        file_name = f"{info['data']['name']}_{music_id}.mp3"
        file_path = os.path.join(target_dir, file_name)
        response = requests.get(audio_url, stream=True)
        with open(file_path, 'wb') as f:
            for chunk in response.iter_content(1024):
                f.write(chunk)
        lyrics = get_lyrics(music_id)
        if lyrics:
            lrc_file = os.path.join(target_dir, f"{info['data']['name']}_{music_id}.lrc")
            with open(lrc_file, 'w', encoding='utf-8') as f:
                f.writelines(f"[{line['time']}]{line['lineLyric']}\n" for line in lyrics)
        return True
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
def get_dynamic_content(url):
    """Render *url* in headless Chrome and return the final page source.

    Use for pages whose content is filled in by JavaScript after load.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        time.sleep(3)  # crude wait for JS rendering; WebDriverWait is more robust
        return driver.page_source
    finally:
        # Always release the browser process — the original leaked it when
        # driver.get() raised before reaching quit().
        driver.quit()
import aiohttp
import asyncio
async def async_fetch(session, url):
    """Fetch *url* via the shared aiohttp session and return the body text."""
    async with session.get(url) as resp:
        body = await resp.text()
    return body
async def async_main(urls):
    """Fetch every URL in *urls* concurrently; results keep the input order."""
    async with aiohttp.ClientSession() as session:
        pending = [async_fetch(session, one_url) for one_url in urls]
        pages = await asyncio.gather(*pending)
    return pages
# MongoDB存儲示例
from pymongo import MongoClient
class MusicDB:
    """Thin wrapper around the local MongoDB collection that stores Kuwo songs."""

    def __init__(self):
        client = MongoClient('mongodb://localhost:27017/')
        self.client = client
        self.db = client['music_db']
        self.collection = self.db['kuwo_music']

    def save_song(self, song_data):
        """Upsert one song document, keyed on its Kuwo resource id ('rid')."""
        selector = {'rid': song_data['rid']}
        changes = {'$set': song_data}
        return self.collection.update_one(selector, changes, upsert=True)
本文詳細介紹了如何使用Python爬取酷我音樂的數據,包括歌曲搜索、詳情獲取、音頻與歌詞下載、反爬蟲應對以及數據存儲等內容。
完整項目代碼已上傳至GitHub(示例地址)。希望本文能幫助你學習Python爬蟲開發,但請務必遵守法律法規,合理使用爬蟲技術。
聲明:本文所有代碼示例僅供學習參考,實際使用時請遵守酷我音樂的相關規定。過度爬取可能導致IP被封禁或承擔法律責任。
這篇文章大約4100字,涵蓋了從基礎到進階的酷我音樂爬蟲實現方法,包含了代碼示例、技術分析和法律注意事項。如需調整內容或補充細節,可以進一步修改完善。
免責聲明:本站發布的內容(圖片、視頻和文字)以原創、轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯系站長郵箱:is@yisu.com進行舉報,并提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。