本文将会详细地讲解从零开始制作酷狗音乐爬虫的过程,该爬虫可以下载酷狗音乐的付费歌曲,使用Python3开发。
随便打开一个歌单,右键检查属性发现这个<a>标签中含有data。

点进去播放,会发现:

看来,音乐外链和这个hash哈希值是脱不了关系的了。
事实上,酷狗音乐歌曲数据外链如下:
http://www.kugou.com/yy/index.php?r=play/getdata&hash=<刚才看到的哈希值>
填入刚刚看到的值(URL:https://www.kugou.com/yy/index.php?r=play/getdata&hash=FE2A4AC53981BB3FAF077CCA2461401A),会看到如下结果:
{
"status":1,
"err_code":0,
"data":{
"hash":"FE2A4AC53981BB3FAF077CCA2461401A",
"timelength":76000,
"filesize":1220066,
"audio_name":"Toby Fox - Snowdin Town",
"have_album":1,
"album_name":"UNDERTALE Soundtrack",
"album_id":"1983482",
"img":"http:\/\/imge.kugou.com\/stdmusic\/20170320\/20170320173006740587.jpg",
"have_mv":0,
"video_id":0,
"author_name":"Toby Fox",
"song_name":"Snowdin Town",
"lyrics":"\ufeff[id:$00000000]\r\n[ar:arkady sevidov]\r\n[ti:June]\r\n[by:]\r\n[hash:4399c9872c7235b60b58ce88dc487897]\r\n[al:]\r\n[sign:]\r\n[qq:]\r\n[total:320317]\r\n[offset:0]\r\n[00:01.58]\u7eaf\u97f3\u4e50\uff0c\u8bf7\u6b23\u8d4f\r\n",
"author_id":"579236",
"privilege":8,
"privilege2":"1000",
"play_url":"https:\/\/webfs.yun.kugou.com\/202002251957\/d2bff4d2209ca251954d38990445b8d2\/G057\/M08\/1A\/17\/GZQEAFaVAR6AYuQ_ABKd4lSFeUo118.mp3",
"authors":[
{
"author_id":"579236",
"sizable_avatar":"http:\/\/singerimg.kugou.com\/uploadpic\/softhead\/{size}\/20171223\/20171223024709440.jpg",
"is_publish":"1",
"author_name":"Toby Fox",
"avatar":"http:\/\/singerimg.kugou.com\/uploadpic\/softhead\/400\/20171223\/20171223024709440.jpg"
}
],
"is_free_part":0,
"bitrate":128,
"audio_id":"20519359",
"play_backup_url":"https:\/\/webfs.cloud.kugou.com\/202002251957\/5d639c7a92c6750b759cab7e718ede04\/G057\/M08\/1A\/17\/GZQEAFaVAR6AYuQ_ABKd4lSFeUo118.mp3"
}
}
我们打开这个play_url(去掉JSON转义用的反斜杠后,即 https://webfs.yun.kugou.com/202002251957/d2bff4d2209ca251954d38990445b8d2/G057/M08/1A/17/GZQEAFaVAR6AYuQ_ABKd4lSFeUo118.mp3),就会打开真实的音乐地址。
所以,我们获取酷狗音乐真实下载链接的流程如下:
先尝试性获取hash:
#!/usr/bin/env python
#-*- coding:utf8 -*-
'''
@Author : Ray
@Contact : ray@raycoder.me
@Software: PyCharm
@File : KugouSpider.py
@Time : 2019/11/4 21:19
'''
import requests, re
from fake_useragent import *
from random import randint
class KugouSpider():
    """First-attempt spider: map each song on a Kugou playlist page to its play URL."""

    HEADER = {  # Forged request headers, to avoid getting the IP banned
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,gl;q=0.6,zh-TW;q=0.4',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://kugou.com/',
        'User-Agent': UserAgent().random
    }

    def getArtUrl(self, artlist_url):
        """Return a dict of ``audio_name -> play_url`` for a playlist page.

        The playlist HTML is fetched, every song hash is pulled out of the
        ``data="<hash>|<digits>"`` attribute of the song links, and the
        ``play/getdata`` endpoint is queried once per hash.

        :param artlist_url: URL of the playlist page to scan.
        :return: dict mapping each song's ``audio_name`` to its ``play_url``;
            songs whose response lacks those fields are left out.
        """
        jsonUrl = 'http://www.kugou.com/yy/index.php?r=play/getdata&hash={0:s}'
        response = requests.get(url=artlist_url, headers=self.HEADER)
        songs = {}
        for i in re.findall(r'<a title=".+" hidefocus="true" href="javascript:;" data="(\w+)\|\d+">', response.text):
            # This first attempt sends no cookies: the class defines none, so
            # referring to self.cookies here would raise AttributeError.
            jsonfile = requests.get(url=jsonUrl.format(i), headers=self.HEADER)
            jsondata = jsonfile.json()
            try:
                songs[jsondata['data']['audio_name']] = jsondata['data']['play_url']
            except (KeyError, TypeError):
                # Error payloads carry no audio_name/play_url; skip that song
                # instead of hiding every possible exception.
                continue
        return songs
哎呀!最后返回了一个空字典!{}为什么呢?我观察后发现,酷狗需要添加一个cookie才可以访问到数据。
我们加一下:
def getUrl(self, artlist_url):
    """Return a dict of ``audio_name -> play_url`` for a playlist page.

    Same lookup as before, but every ``play/getdata`` request now carries a
    ``kg_mid`` cookie.

    :param artlist_url: URL of the playlist page to scan.
    :return: dict mapping each song's ``audio_name`` to its ``play_url``;
        songs whose response lacks those fields are left out.
    """
    jsonUrl = 'http://www.kugou.com/yy/index.php?r=play/getdata&hash={0:s}'
    response = requests.get(url=artlist_url, headers=self.HEADER)
    songs = {}
    for i in re.findall(r'<a title=".+" hidefocus="true" href="javascript:;" data="(\w+)\|\d+">', response.text):
        jsonfile = requests.get(url=jsonUrl.format(i), headers=self.HEADER, cookies={'kg_mid': '1'})
        jsondata = jsonfile.json()
        try:
            songs[jsondata['data']['audio_name']] = jsondata['data']['play_url']
        except (KeyError, TypeError):
            # Only a missing/malformed 'data' section is expected here; a bare
            # except would also swallow KeyboardInterrupt and real bugs.
            continue
    return songs
嗯?还是没有数据!我觉得可能是HEADER中的host的问题。
把它去掉!
class KugouSpider():
    """Spider that maps each song on a Kugou playlist page to its play URL."""

    # Request headers sent with every request.
    HEADER = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,gl;q=0.6,zh-TW;q=0.4',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://kugou.com/',
        'User-Agent': UserAgent().random
    }

    def getUrl(self, artlist_url):
        """Return a dict of ``audio_name -> play_url`` for a playlist page.

        :param artlist_url: URL of the playlist page to scan.
        :return: dict mapping each song's ``audio_name`` to its ``play_url``;
            songs whose response lacks those fields are left out.
        """
        jsonUrl = 'http://www.kugou.com/yy/index.php?r=play/getdata&hash={0:s}'
        response = requests.get(url=artlist_url, headers=self.HEADER)
        songs = {}
        for i in re.findall(r'<a title=".+" hidefocus="true" href="javascript:;" data="(\w+)\|\d+">', response.text):
            jsonfile = requests.get(url=jsonUrl.format(i), headers=self.HEADER, cookies={'kg_mid': '1'})
            jsondata = jsonfile.json()
            try:
                songs[jsondata['data']['audio_name']] = jsondata['data']['play_url']
            except (KeyError, TypeError):
                # Responses without audio_name/play_url are skipped; anything
                # else is allowed to surface instead of being silenced.
                continue
        return songs
这样,我们就获得了真实的数据:
{'Toby Fox - Snowy': 'https://webfs.yun.kugou.com/202002280800/8b338c7c966f73984a16d332fe0a1905/G053/M04/0C/10/dQ0DAFaVAPCAdhQcABl6n5DvBc0140.mp3', 'Toby Fox - Undertale': 'https://webfs.yun.kugou.com/202002280800/91772c1108ab0627888c84f9fb6380f1/G057/M04/17/05/eQ0DAFaiNEWAXZVoAF0rmY34-tU840.mp3', 'Toby Fox - Tem Shop': 'https://webfs.yun.kugou.com/202002280800/02fb916f700f9df912d93daa77c9d420/G055/M0A/06/07/14YBAFaVAdyIb_RdAAsQ0yCL9vEAAAorgG2O-IACxDr193.mp3', 'Toby Fox - sans.': 'https://webfs.yun.kugou.com/202002280800/3580fcc155cd84b346acc9465504b01a/G052/M02/0C/07/1IYBAFaVAOWINLXsAAxhJsPWw2UAAAy4gIAOs0ADGE-807.mp3', 'Toby Fox - Dogsong': 'https://webfs.yun.kugou.com/202002280800/399c52d789e7f659001ab84e2bcfa823/G055/M00/06/11/F5QEAFaVARWIZFBMAAkgfyVHyJ4AAAoqAMZ4XYACSCX641.mp3'}
不对呀!为什么只找到了几首音乐?我们打印一下jsondata。
def getUrl(self, artlist_url):
    """Debugging variant: print the raw ``play/getdata`` JSON of every song.

    Collecting the play URLs is switched off here, so the returned dict is
    always empty; the point of this version is the printed output.

    :param artlist_url: URL of the playlist page to scan.
    :return: an empty dict.
    """
    jsonUrl = 'http://www.kugou.com/yy/index.php?r=play/getdata&hash={0:s}'
    response = requests.get(url=artlist_url, headers=self.HEADER)
    songs = {}
    for i in re.findall(r'<a title=".+" hidefocus="true" href="javascript:;" data="(\w+)\|\d+">', response.text):
        jsonfile = requests.get(url=jsonUrl.format(i), headers=self.HEADER, cookies={'kg_mid': '1'})
        jsondata = jsonfile.json()
        # Show exactly what the endpoint answered for this hash.
        print(jsondata)
        # The audio_name -> play_url bookkeeping is intentionally left out
        # while inspecting the responses (the old try/except around a bare
        # `pass` could never trigger and has been dropped).
    return songs
Woc原来这个东西返回的是这种数据:
{'status': 0, 'err_code': 30020, 'data': {'SSA-CODE': 'bj_event_ee07e35318276c48c519523c26d9f4ce', 'SSA-HMID': '1'}}
我们随便打开一个hash源链:

也是这种情况。
我们再打开播放地址https://www.kugou.com/song/#hash=FE2A4AC53981BB3FAF077CCA2461401A&album_id=1983482:喔!它出现了一个验证码。这个东西就是罪魁祸首。
验证之后再打开hash源链,数据就出现了。
所以我们可以判断酷狗音乐这个cookie不止kg_mid一个。
我们打开浏览器控制台:

这里有恶心的cookie!
把它们复制到程序中去。
乃得:
class KugouSpider():
    """Spider that maps each song on a Kugou playlist page to its play URL.

    All requests are sent with a fixed set of cookies copied from a browser
    session.
    """

    HEADER = {  # Forged request headers, to avoid getting the IP banned
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,gl;q=0.6,zh-TW;q=0.4',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://kugou.com/',
        'User-Agent': UserAgent().random
    }

    # Cookies copied from the browser console.
    # NOTE(review): the 'Hm_lvt_... 1582' key contains a space and a trailing
    # "1582" -- looks like a copy/paste artifact; confirm the real cookie name.
    cookies = {
        'kg_mid': '44c117b70a8abbe992b50ee90d19784f',
        'kg_dfid': '2SoQgm4W6BHT0eH1EP3qAt3X',
        'kg_dfid_collect': 'd41d8cd98f00b204e9800998ecf8427e',
        'KuGooRandom': '66781582627339920',
        'Hm_lvt_aedee6983d4cfc62f509129360d6bb3d 1582': '1582627291,1582630139,1582674521',
        'kg_mid_temp': '44c117b70a8abbe992b50ee90d19784f',
        'Hm_lpvt_aedee6983d4cfc62f509129360d6bb3d': '1582676297',
        'ACK_SERVER_10016': '%7B%22list%22%3A%5B%5B%22gzreg-user.kugou.com%22%5D%5D%7D',
        'ACK_SERVER_10015': '%7B%22list%22%3A%5B%5B%22bjlogin-user.kugou.com%22%5D%5D%7D',
        'ACK_SERVER_10017': '%7B%22list%22%3A%5B%5B%22gzverifycode.service.kugou.com%22%5D%5D%7D'
    }

    def getUrl(self, artlist_url):
        """Return a dict of ``audio_name -> play_url`` for a playlist page.

        Both the playlist request and every ``play/getdata`` request are sent
        with ``self.cookies``.

        :param artlist_url: URL of the playlist page to scan.
        :return: dict mapping each song's ``audio_name`` to its ``play_url``;
            songs whose response lacks those fields are left out.
        """
        jsonUrl = 'http://www.kugou.com/yy/index.php?r=play/getdata&hash={0:s}'
        response = requests.get(url=artlist_url, headers=self.HEADER, cookies=self.cookies)
        songs = {}
        for i in re.findall(r'<a title=".+" hidefocus="true" href="javascript:;" data="(\w+)\|\d+">', response.text):
            jsonfile = requests.get(url=jsonUrl.format(i), headers=self.HEADER, cookies=self.cookies)
            jsondata = jsonfile.json()
            try:
                songs[jsondata['data']['audio_name']] = jsondata['data']['play_url']
            except (KeyError, TypeError):
                # Responses without audio_name/play_url (e.g. error payloads)
                # are skipped; other exceptions are no longer silenced.
                continue
        return songs
这个cookie是可以重复使用的,我们只需要在数据无法提取时去验证一下即可。
下载音乐可以去看我的这篇文章:
所以我们完成的程序如下:
#!/usr/bin/env python
#-*- coding:utf8 -*-
'''
@Author : Ray
@Contact : ray@raycoder.me
@Software: PyCharm
@File : KugouSpider.py
@Time : 2019/11/4 21:19
'''
import requests, re
from fake_useragent import *
from random import randint
class KugouSpider():
    """Spider that maps each song on a Kugou playlist page to its play URL.

    All requests are sent with a fixed set of cookies copied from a browser
    session.
    """

    HEADER = {  # Forged request headers, to avoid getting the IP banned
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,gl;q=0.6,zh-TW;q=0.4',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://kugou.com/',
        'User-Agent': UserAgent().random
    }

    # Cookies copied from the browser console.
    # NOTE(review): the 'Hm_lvt_... 1582' key contains a space and a trailing
    # "1582" -- looks like a copy/paste artifact; confirm the real cookie name.
    cookies = {
        'kg_mid': 'd7d2db42653ccb5262c6b3a53d3e513e',
        'kg_dfid': '4E1aaE0U1ZlH0aaWYz3Z7N6n',
        'kg_dfid_collect': 'd41d8cd98f00b204e9800998ecf8427e',
        'KuGooRandom': '66781582627339920',
        'Hm_lvt_aedee6983d4cfc62f509129360d6bb3d 1582': '1583409383',
        'kg_mid_temp': 'd7d2db42653ccb5262c6b3a53d3e513e',
        'Hm_lpvt_aedee6983d4cfc62f509129360d6bb3d': '1583409803',
        'ACK_SERVER_10016': '%7B%22list%22%3A%5B%5B%22gzreg-user.kugou.com%22%5D%5D%7D',
        'ACK_SERVER_10015': '%7B%22list%22%3A%5B%5B%22bjlogin-user.kugou.com%22%5D%5D%7D',
        'ACK_SERVER_10017': '%7B%22list%22%3A%5B%5B%22gzverifycode.service.kugou.com%22%5D%5D%7D'
    }

    def getArtUrl(self, artlist_url):
        """Return a dict of ``audio_name -> play_url`` for a playlist page.

        The playlist HTML is fetched, every song hash is pulled out of the
        ``data="<hash>|<digits>"`` attribute of the song links, and the
        ``play/getdata`` endpoint is queried once per hash.

        :param artlist_url: URL of the playlist page to scan.
        :return: dict mapping each song's ``audio_name`` to its ``play_url``;
            songs whose response lacks those fields are left out.
        """
        jsonUrl = 'http://www.kugou.com/yy/index.php?r=play/getdata&hash={0:s}'
        # Send the cookies with the playlist request as well, so both requests
        # present the same session.
        response = requests.get(url=artlist_url, headers=self.HEADER, cookies=self.cookies)
        songs = {}
        for i in re.findall(r'<a title=".+" hidefocus="true" href="javascript:;" data="(\w+)\|\d+">', response.text):
            jsonfile = requests.get(url=jsonUrl.format(i), headers=self.HEADER, cookies=self.cookies)
            jsondata = jsonfile.json()
            try:
                songs[jsondata['data']['audio_name']] = jsondata['data']['play_url']
            except (KeyError, TypeError):
                # Responses without audio_name/play_url (e.g. error payloads)
                # are skipped; other exceptions are no longer silenced.
                continue
        return songs

    # Alias so that callers using the shorter name getUrl keep working.
    getUrl = getArtUrl
测试:
from KugouSpider import KugouSpider
from download import downloadMusic
def _test():
    """Smoke test: resolve one playlist and hand the result to downloadMusic."""
    spider = KugouSpider()
    # The finished KugouSpider class exposes the lookup as getArtUrl.
    # NOTE(review): downloadMusic comes from the separate download module;
    # presumably it accepts the name -> URL dict -- verify against that file.
    downloadMusic(spider.getArtUrl('https://www.kugou.com/yy/special/single/1094256.html'))


if __name__ == '__main__':
    _test()
这个也要写为一个单独的文件:test.py。
P.S.我发现貌似随机Cookie有效:
def randten(large):
    """Return a random integer from 1 up to and including 10**large."""
    upper = 10 ** large
    return randint(1, upper)


# Cookie set whose values are random decimal strings instead of values
# captured from a browser.
# NOTE(review): the 'Hm_lvt_... 1582' key contains a space and a trailing
# "1582" -- looks like a copy/paste artifact; confirm the real cookie name.
cookies = {
    'kg_mid': str(randten(30)),
    'kg_dfid': str(randten(30)),
    'kg_dfid_collect': str(randten(30)),
    'KuGooRandom': str(randten(12)),
    'Hm_lvt_aedee6983d4cfc62f509129360d6bb3d 1582': str(randten(10)),
    'kg_mid_temp': str(randten(30)),
    'Hm_lpvt_aedee6983d4cfc62f509129360d6bb3d': str(randten(30)),
    'ACK_SERVER_10016': str(randten(30)),
    'ACK_SERVER_10015': str(randten(30)),
    'ACK_SERVER_10017': str(randten(30)),
}
可以试试。