Scraping NetEase Cloud Music

All I can say is: it was one obstacle after another.

Grabbing the link for each song listed in the playlist

```python
# Yesterday's results
# Trying to scrape a page as practice material -- finding one by hand is too slow
import io
import sys

import requests
from bs4 import BeautifulSoup
from selenium import webdriver

# Change the default encoding of standard output
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

# Scraping NetEase Cloud Music songs + comments
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'Referer': 'https://music.163.com/',
    'Host': 'music.163.com'
}

# Idea: skip the login, save the page locally first, then scrape the comments
# one by one -- keep the first 100, or only comments with 200+ likes/replies?
path = r'./list.html'  # the playlist page is dynamic; Ctrl+S only saves the outer page
with open(path, encoding='utf-8') as file:
    context = file.read()
soup = BeautifulSoup(context, "html.parser")
links = []   # a list to hold the song links
titles = []
res = soup.select("tr")
for li in res:
    title = li.find_all("b")
    link = li.find_all("a")
    try:
        new_link = 'http://music.163.com' + link[0].attrs['href']
        links.append(new_link)
    except (IndexError, KeyError):
        pass
    for i in title:
        titles.append(i.attrs['title'])
# print(links, '\n', titles)

url = links[0]
url = url.replace('/#', '').replace('https', 'http')  # drop the '/#' and switch the protocol
print(url)
driver = webdriver.Chrome()
driver.get(url)
response = driver.page_source
html = BeautifulSoup(response, 'html.parser')
print(html)
# lyric = html.find(name='div', id="lyric-content")

res = requests.get(url=url, headers=headers).text
with open('./1.txt', 'w', encoding='utf-8') as f1:
    f1.write(res)
with open('./2.txt', 'w', encoding='utf-8') as f2:
    f2.write(response)

# A detailed bs4 tutorial:
# https://blog.csdn.net/learner_syj/article/details/120590574
```

Pitfall 1:
Neither plain requests nor a Selenium-driven browser gets you the actual page content; it seems to be loaded asynchronously, so the page has to be "poked" first. Open any song page in a browser and press F12 to open the dev console.
![屏幕截图 2023-12-24 134925](https://raw.githubusercontent.com/huanlue/image_info/main/img/blog/屏幕截图%202023-12-24%20134925.png)

You only get the HTML document at the top; the song's main page underneath is missing. In other words, the HTML you receive contains no lyrics, no comments, nothing.

Judging by the answers online, the only option is to call the API and fetch everything about a song indirectly. This anti-scraping stuff is a real pain. ![屏幕截图 2023-12-24 135555](https://raw.githubusercontent.com/huanlue/image_info/main/img/blog/屏幕截图%202023-12-24%20140039.png)
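(That said, if you want to stay with Selenium, the missing content appears to live inside an embedded frame rather than the top document, so switching into the frame before reading page_source may be enough. A minimal sketch; the frame id `g_iframe` is my assumption about the page structure, not something from the reference articles:)

```python
# Sketch: read a song page with Selenium by entering its content frame first.
# Assumption: the real content sits in an <iframe id="g_iframe"> on music.163.com.
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('http://music.163.com/song?id=516997458')
driver.switch_to.frame('g_iframe')           # hop into the content frame (assumed id)
html = BeautifulSoup(driver.page_source, 'html.parser')
print(html.find('div', id='lyric-content'))  # the lyric div should exist now
driver.quit()
```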

Open the Network tab in the console. According to the reference article, the request
(https://music.163.com/weapi/comment/resource/comments/get?csrf_token=61c471b2f6305c93e6288b165855e87f) carries an encrypted token suffix, and its payload comes from two parameters, params and encSeckey, which in turn need decrypting. For the decryption, see [this Zhihu answer by 知乎用户A84ovo](https://www.zhihu.com/question/36081767/answer/140287795).


Since the old, unencrypted API is still working, I'll just use that for now (from [this Zhihu answer by 肖飞](https://www.zhihu.com/question/36081767/answer/310726622)):

https://music.163.com/api/v1/resource/comments/R_SO_4_516997458

The trailing '516997458' is the song id, and the id is easy to get.
According to the comments under that answer, this old API limits how many comments a single request returns: limit apparently maxes out at 100 (https://music.163.com/api/v1/resource/comments/R_SO_4_516997458?limit=100), which is already a lot of comments per request. So to get everything, page through with offset (https://music.163.com/api/v1/resource/comments/R_SO_4_516997458?offset=100).
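As a quick sanity check, here is roughly how pages of comments can be pulled from the old API with requests (a sketch reusing the headers from above; limit=20 and the offset stepping are my choices, not values from the articles):

```python
import json

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'Referer': 'https://music.163.com/',
    'Host': 'music.163.com'
}

song_id = '516997458'
base = 'https://music.163.com/api/v1/resource/comments/R_SO_4_' + song_id

# Fetch the first two pages, 20 comments each: offset advances by the page size.
for page in range(2):
    url = '{}?limit=20&offset={}'.format(base, page * 20)
    data = json.loads(requests.get(url, headers=headers).text)
    for c in data['comments']:
        print(c['user']['nickname'], ':', c['content'])
```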

The API for downloading lyrics is:
http://music.163.com/api/song/lyric?id={}&lv=-1&kv=-1&tv=-1 (format in the song_id)

The API format for downloading a song's MP3 file is:
http://music.163.com/song/media/outer/url?id=
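Putting the two together: given a song id, the lyric URL returns JSON with the LRC text under ['lrc']['lyric'], while the outer/url link redirects straight to the audio bytes. A quick sketch (the id is the example one from above; headers omitted for brevity, though the real script sends the same ones as earlier):

```python
import json

import requests

song_id = '516997458'

lyric_url = 'http://music.163.com/api/song/lyric?id={}&lv=-1&kv=-1&tv=-1'.format(song_id)
mp3_url = 'http://music.163.com/song/media/outer/url?id=' + song_id

# The lyric endpoint returns JSON; the LRC text sits under ['lrc']['lyric']
lyric_json = json.loads(requests.get(lyric_url).text)
print(lyric_json['lrc']['lyric'][:200])

# The outer/url endpoint redirects to the actual MP3 data
audio = requests.get(mp3_url).content
print('{} bytes of audio'.format(len(audio)))
```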

Below is the code I wrote after that:

```python
# coding=utf-8
import io
import json
import os
import re
import sys
from datetime import datetime

import requests
from lxml import etree

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')


# Get the list of song ids; returns ids, song names, and full download links
def songs_lists(url=None):
    if url is None:
        url = 'https://music.163.com/#/playlist?id=2781114174'
    url = url.replace('/#', '').replace('https', 'http')  # drop '/#' and switch the protocol
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Referer': 'https://music.163.com/',
        'Host': 'music.163.com'
    }
    # Fetch the page source
    res = requests.get(url=url, headers=headers).text
    out_link = 'http://music.163.com/song/media/outer/url?id='
    tree = etree.HTML(res)
    # The song list
    song_list = tree.xpath('//ul[@class="f-hide"]/li/a')
    # In case this is an artist page
    artist_name_tree = tree.xpath('//h2[@id="artist-name"]/text()')
    artist_name = str(artist_name_tree[0]) if artist_name_tree else None

    # In case this is a playlist page:
    song_list_name_tree = tree.xpath('//h2[contains(@class,"f-ff2")]/text()')
    song_list_name = str(song_list_name_tree[0]) if song_list_name_tree else None

    song_links = []
    song_ids = []
    names = []
    for i, s in enumerate(song_list):
        href = str(s.xpath('./@href')[0])
        song_id = href.split('=')[-1]
        song_ids.append(song_id)
        src = out_link + song_id  # build the song's real src resource link
        song_links.append(src)
        title = str(s.xpath('./text()')[0])  # the song's name
        names.append(title)
    return [song_ids, names, song_links]


def download_song(lists):
    if lists[2]:
        print('Starting music download')
        download_location = './download_music'
        if not os.path.exists(download_location):
            os.mkdir(download_location)
        for n, song in enumerate(lists[2]):
            print('Downloading song {}: {}'.format(n, lists[1][n]))
            name = download_location + '/' + lists[1][n] + '.mp3'
            try:
                data = requests.get(song).content
                with open(name, 'wb') as f:
                    f.write(data)
            except Exception as e:
                print(e)
        print('All {} songs downloaded!'.format(len(lists[1])))


def download_comment(lists):
    if lists[2]:
        print('Starting comment download!')

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Referer': 'https://music.163.com/',
        'Host': 'music.163.com'
    }

    for n1, song_id in enumerate(lists[0]):
        link = 'https://music.163.com/api/v1/resource/comments/R_SO_4_' + song_id
        # NB: offset only advances by 2 per request here, so the 20-comment
        # pages overlap heavily -- fixed in the next version
        for p in range(1, 20, 2):
            url = link + '?offset=' + str(p)
            print(url)
            res = requests.get(url=url, headers=headers).text
            json_obj = json.loads(res)
            # Mind the data structure: a list of dicts, and the dicts contain lists
            comments = json_obj['comments']
            print('Downloading comments for song {}: {}'.format(n1, lists[1][n1]))

            download_location = './download_music'
            if not os.path.exists(download_location):
                os.mkdir(download_location)

            for i, n in enumerate(comments):
                comment = n['content']
                nickname = n['user']['nickname']
                time_update = n['time']
                # The timestamp is in milliseconds, hence the floor division
                ts = datetime.fromtimestamp(time_update // 1000)
                ts = ts.strftime('%Y-%m-%d %H:%M:%S')  # keep just date and time
                name_comment = download_location + '/' + lists[1][n1] + '.txt'
                try:
                    with open(name_comment, 'a+') as f:
                        data = '\t' + nickname + ':' + '\t' + ts + '\n' + comment + '\n'
                        f.write(data + '\n')
                except Exception as e:
                    print(e)
    print('Comments for all {} songs downloaded!'.format(len(lists[1])))


def download_lyrics(lists):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Referer': 'https://music.163.com/',
        'Host': 'music.163.com'
    }
    song_name, song_id = lists[1], lists[0]
    for i, sid in enumerate(song_id):
        url = 'http://music.163.com/api/song/lyric?id={}&lv=-1&kv=-1&tv=-1'.format(sid)
        print(url)
        # Fetch the lyric JSON
        res = requests.get(url=url, headers=headers).text
        json_obj = json.loads(res)
        lyric = json_obj['lrc']['lyric']
        reg = re.compile(r'\[.*\]')  # strip the [mm:ss.xx] time tags
        lrc_text = re.sub(reg, '', lyric).strip()
        # print(song_name, lrc_text)
        download_location = './download_music'
        if not os.path.exists(download_location):
            os.mkdir(download_location)
        name = download_location + '/' + song_name[i] + '.lrc'
        try:
            with open(name, 'w') as f:
                f.write(lrc_text)
        except Exception as e:
            print(e)


# try again
if __name__ == '__main__':
    print('start:\n')
    url = 'https://music.163.com/#/playlist?id=2384642500'
    lists = songs_lists(url)
    # Download the songs
    download_song(lists)
    # Download the comments
    download_comment(lists)
    # Download the lyrics
    download_lyrics(lists)
    print('end!')
```

In the end I still did the first step by hand: went to the browser myself and saved the playlist's HTML document locally.
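(In principle even this manual Ctrl+S step could be automated: have Selenium open the playlist, hop into the content frame, and dump page_source to mylist.html. A sketch under the same g_iframe assumption as above:)

```python
from selenium import webdriver

url = 'https://music.163.com/#/playlist?id=2781114174'
driver = webdriver.Chrome()
driver.get(url)
driver.switch_to.frame('g_iframe')   # assumed content frame, as before
with open('./mylist.html', 'w', encoding='utf-8') as f:
    f.write(driver.page_source)      # the frame's HTML, song table included
driver.quit()
```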

```python
# coding=utf-8
import io
import json
import os
import re
import sys
# import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from lxml import etree

# Switch the whole console's output encoding to gb18030
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')


# Get the list of song ids; returns ids, song names, and full download links
def songs_lists(url=None):
    if url is None:
        url = 'https://music.163.com/#/playlist?id=2781114174'
    url = url.replace('/#', '').replace('https', 'http')  # drop '/#' and switch the protocol
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Referer': 'https://music.163.com/',
        'Host': 'music.163.com'
    }
    # Fetch the page source
    res = requests.get(url=url, headers=headers).text

    # NetEase's API entry point for direct song links
    out_link = 'http://music.163.com/song/media/outer/url?id='

    tree = etree.HTML(res)
    # The song list
    song_list = tree.xpath('//ul[@class="f-hide"]/li/a')
    # In case this is an artist page
    artist_name_tree = tree.xpath('//h2[@id="artist-name"]/text()')
    artist_name = str(artist_name_tree[0]) if artist_name_tree else None

    # In case this is a playlist page:
    song_list_name_tree = tree.xpath('//h2[contains(@class,"f-ff2")]/text()')
    song_list_name = str(song_list_name_tree[0]) if song_list_name_tree else None

    # Collect the song ids, song names, and direct links
    song_ids, names, song_links = [], [], []
    for i, s in enumerate(song_list):
        href = str(s.xpath('./@href')[0])

        song_id = href.split('=')[-1]
        song_ids.append(song_id)

        src = out_link + song_id  # build the song's real src resource link
        song_links.append(src)

        title = str(s.xpath('./text()')[0])  # the song's name
        names.append(title)

    return [song_ids, names, song_links]


def download_song(lists, download_location):
    if lists[2]:
        print('Starting music download')

        # Create the folder if it does not exist yet
        if not os.path.exists(download_location):
            os.mkdir(download_location)

        for n, song in enumerate(lists[2]):
            print('Downloading song {}: {}'.format(n, lists[1][n]))
            name = download_location + '/' + lists[1][n] + '.mp3'
            try:
                data = requests.get(song).content
                # Songs are saved as binary data, hence 'wb'
                with open(name, 'wb') as f:
                    f.write(data)
            except Exception as e:
                print(e)
        print('All {} songs downloaded!'.format(len(lists[1])))


def download_comment(lists, download_location, pages=10):
    if lists[2]:
        print('Starting comment download!')

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Referer': 'https://music.163.com/',
        'Host': 'music.163.com'
    }

    for n1, song_id in enumerate(lists[0]):
        # The token-free comment API
        link = 'https://music.163.com/api/v1/resource/comments/R_SO_4_' + song_id
        for page in range(pages):
            # Advance offset by the page size (20) so pages don't overlap
            url = link + '?limit=20&offset=' + str(page * 20)
            print('Comment source link: ' + url)
            res = requests.get(url=url, headers=headers).text
            json_obj = json.loads(res)
            # print(type(json_obj))  # peek at the returned structure
            # Mind the data structure: a list of dicts, and the dicts contain lists
            comments = json_obj['comments']
            print('Downloading comments for song {}: {}'.format(n1, lists[1][n1]))

            if not os.path.exists(download_location):
                os.mkdir(download_location)

            for i, n in enumerate(comments):
                # Each comment is a dict
                comment = n['content']
                nickname = n['user']['nickname']
                time_update = n['time']
                # The timestamp oddly carries three extra digits (milliseconds), so floor-divide
                ts = datetime.fromtimestamp(time_update // 1000)
                ts = ts.strftime('%Y-%m-%d %H:%M:%S')  # keep just date and time
                name_comment = download_location + '/' + lists[1][n1] + '.txt'
                # Could also filter by a comment's like count here; didn't get around to it
                try:
                    with open(name_comment, 'a+', encoding='utf-8') as f:
                        data = '\t' + nickname + ':' + '\t' + ts + '\n' + comment + '\n'
                        f.write(data + '\n')
                except Exception as e:
                    print(e)
    print('The first {1} pages of comments for all {0} songs downloaded!'.format(len(lists[1]), pages))


def download_lyrics(lists, download_location):
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Referer': 'https://music.163.com/',
        'Host': 'music.163.com'
    }
    song_name, song_id = lists[1], lists[0]
    for i, sid in enumerate(song_id):
        url = 'http://music.163.com/api/song/lyric?id={}&lv=-1&kv=-1&tv=-1'.format(sid)
        print(url)
        # Fetch the lyric JSON
        try:
            res = requests.get(url=url, headers=headers).text
            json_obj = json.loads(res)
            lyric = json_obj['lrc']['lyric']
            reg = re.compile(r'\[.*\]')  # strip the [mm:ss.xx] time tags
            lrc_text = re.sub(reg, '', lyric).strip()
        except Exception as e:
            print(e)
            continue
        if not os.path.exists(download_location):
            os.mkdir(download_location)
        name = download_location + '/' + song_name[i] + '.lrc'
        try:
            with open(name, 'w', encoding='utf-8') as f:
                f.write(lrc_text.replace('\xa0', ''))
        except Exception as e:
            print(e)


# Reading from a locally saved HTML file; works much like the first version
def songs_lists_html(link):
    path = link  # the playlist page is dynamic; Ctrl+S only saves the outer page
    with open(path, encoding='utf-8') as file:
        context = file.read()

    soup = BeautifulSoup(context, "html.parser")
    song_links = []
    links = []
    song_ids = []
    names = []
    res = soup.select("tr")
    for li in res:
        title = li.find_all("b")
        link = li.find_all("a")
        try:
            new_link = r'http://music.163.com' + link[0].attrs['href']
            song_links.append(new_link)
        except (IndexError, KeyError):
            pass
        for i in title:
            out = i.attrs['title'].replace('\xa0', '')  # lots of \xa0 showed up; strip them
            names.append(out)
    for n in song_links:
        link = n.replace('/song?', '/song/media/outer/url?').replace('https', 'http')
        links.append(link)
        sid = n.replace('http://music.163.com/song?id=', '')
        song_ids.append(sid)
    # http://music.163.com/song/media/outer/url?id=xxxx
    return [song_ids, names, links]


# try again
if __name__ == '__main__':

    url = 'https://music.163.com/#/playlist?id=2781114174'
    # Local file
    link = r'./mylist.html'
    pages = 10
    # Get the ids, names, urls
    # lists = songs_lists(url)
    info = songs_lists_html(link)

    names = info[1][0:10]
    links = info[2][0:10]
    ids = info[0][0:10]
    lists = [ids, names, links]

    # Where to save everything
    download_location = r'./cloudmusic'
    # Download the songs
    download_song(lists, download_location)
    # Download the comments (some encoding problems remain, and scraping too fast
    # gets you forcibly cut off by the server -- sprinkle in time.sleep() if needed)
    download_comment(lists, download_location, pages)
    # Download the lyrics
    download_lyrics(lists, download_location)
```
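One loose end from the comment in main: the server cutting off fast scrapers. A polite-fetch helper along these lines could replace the bare requests.get calls (a sketch; the one-second delay and retry count are arbitrary choices):

```python
import time

import requests


def polite_get(url, headers=None, delay=1.0, retries=3):
    """requests.get with a fixed delay before each call and simple retries."""
    for attempt in range(retries):
        time.sleep(delay)  # don't hammer the server
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()
            return resp
        except requests.RequestException as e:
            print('attempt {} failed: {}'.format(attempt + 1, e))
    raise RuntimeError('giving up on ' + url)
```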


That's it for today.