Scraping NetEase Cloud Music

All I can say is: it was one obstacle after another.

Grabbing the link for each song listed in the playlist

```python
# Yesterday's results
# Trying to scrape a page as practice material -- finding one by hand is too slow
import io
import sys

import requests
from bs4 import BeautifulSoup
from selenium import webdriver

# Change the default encoding of standard output
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

# Scraping NetEase Cloud Music songs + comments
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'Referer': 'https://music.163.com/',
    'Host': 'music.163.com'
}

# Idea: skip the login, save the page locally first, then scrape the comments
# one by one -- keep the first 100, or only comments with 200+ likes/replies?
path = r'./list.html'  # the playlist page is dynamic; Ctrl+S only saves the outer page
with open(path, encoding='utf-8') as file:
    context = file.read()
soup = BeautifulSoup(context, "html.parser")
links = []   # a list to hold the song links
titles = []
res = soup.select("tr")
for li in res:
    title = li.find_all("b")
    link = li.find_all("a")
    try:
        new_link = 'http://music.163.com' + link[0].attrs['href']
        links.append(new_link)
    except (IndexError, KeyError):
        pass
    for i in title:
        titles.append(i.attrs['title'])
# print(links, '\n', titles)

url = links[0]
url = url.replace('/#', '').replace('https', 'http')  # drop the '/#' and switch the protocol
print(url)
driver = webdriver.Chrome()
driver.get(url)
response = driver.page_source
html = BeautifulSoup(response, 'html.parser')
print(html)
# lyric = html.find(name='div', id="lyric-content")

res = requests.get(url=url, headers=headers).text
with open('./1.txt', 'w', encoding='utf-8') as f1:
    f1.write(res)
with open('./2.txt', 'w', encoding='utf-8') as f2:
    f2.write(response)

# A detailed bs4 tutorial:
# https://blog.csdn.net/learner_syj/article/details/120590574
```

Pitfall 1:
Neither plain requests nor a Selenium-driven browser gets you the actual page content; it seems to be loaded asynchronously, so the page has to be "poked" first. Open any song page in a browser and press F12 to open the dev console.
![屏幕截图 2023-12-24 134925](https://raw.githubusercontent.com/huanlue/image_info/main/img/blog/屏幕截图%202023-12-24%20134925.png)

You only get the HTML document at the top; the song's main page underneath is missing. In other words, the HTML you receive contains no lyrics, no comments, nothing.

Judging by the answers online, the only option is to call the API and fetch everything about a song indirectly. This anti-scraping stuff is a real pain. ![屏幕截图 2023-12-24 135555](https://raw.githubusercontent.com/huanlue/image_info/main/img/blog/屏幕截图%202023-12-24%20140039.png)
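(That said, if you want to stay with Selenium, the missing content appears to live inside an embedded frame rather than the top document, so switching into the frame before reading page_source may be enough. A minimal sketch; the frame id `g_iframe` is my assumption about the page structure, not something from the reference articles:)

```python
# Sketch: read a song page with Selenium by entering its content frame first.
# Assumption: the real content sits in an <iframe id="g_iframe"> on music.163.com.
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('http://music.163.com/song?id=516997458')
driver.switch_to.frame('g_iframe')           # hop into the content frame (assumed id)
html = BeautifulSoup(driver.page_source, 'html.parser')
print(html.find('div', id='lyric-content'))  # the lyric div should exist now
driver.quit()
```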

Open the Network tab in the console. According to the reference article, the request
(https://music.163.com/weapi/comment/resource/comments/get?csrf_token=61c471b2f6305c93e6288b165855e87f) carries an encrypted token suffix, and its payload comes from two parameters, params and encSeckey, which in turn need decrypting. For the decryption, see [this Zhihu answer by 知乎用户A84ovo](https://www.zhihu.com/question/36081767/answer/140287795).


Since the old, unencrypted API is still working, I'll just use that for now (from [this Zhihu answer by 肖飞](https://www.zhihu.com/question/36081767/answer/310726622)):

https://music.163.com/api/v1/resource/comments/R_SO_4_516997458

The trailing '516997458' is the song id, and the id is easy to get.
According to the comments under that answer, this old API limits how many comments a single request returns: limit apparently maxes out at 100 (https://music.163.com/api/v1/resource/comments/R_SO_4_516997458?limit=100), which is already a lot of comments per request. So to get everything, page through with offset (https://music.163.com/api/v1/resource/comments/R_SO_4_516997458?offset=100).
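As a quick sanity check, here is roughly how pages of comments can be pulled from the old API with requests (a sketch reusing the headers from above; limit=20 and the offset stepping are my choices, not values from the articles):

```python
import json

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'Referer': 'https://music.163.com/',
    'Host': 'music.163.com'
}

song_id = '516997458'
base = 'https://music.163.com/api/v1/resource/comments/R_SO_4_' + song_id

# Fetch the first two pages, 20 comments each: offset advances by the page size.
for page in range(2):
    url = '{}?limit=20&offset={}'.format(base, page * 20)
    data = json.loads(requests.get(url, headers=headers).text)
    for c in data['comments']:
        print(c['user']['nickname'], ':', c['content'])
```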

The API for downloading lyrics is:
http://music.163.com/api/song/lyric?id={}&lv=-1&kv=-1&tv=-1 (format in the song_id)

The API format for downloading a song's MP3 file is:
http://music.163.com/song/media/outer/url?id=
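Putting the two together: given a song id, the lyric URL returns JSON with the LRC text under ['lrc']['lyric'], while the outer/url link redirects straight to the audio bytes. A quick sketch (the id is the example one from above; headers omitted for brevity, though the real script sends the same ones as earlier):

```python
import json

import requests

song_id = '516997458'

lyric_url = 'http://music.163.com/api/song/lyric?id={}&lv=-1&kv=-1&tv=-1'.format(song_id)
mp3_url = 'http://music.163.com/song/media/outer/url?id=' + song_id

# The lyric endpoint returns JSON; the LRC text sits under ['lrc']['lyric']
lyric_json = json.loads(requests.get(lyric_url).text)
print(lyric_json['lrc']['lyric'][:200])

# The outer/url endpoint redirects to the actual MP3 data
audio = requests.get(mp3_url).content
print('{} bytes of audio'.format(len(audio)))
```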

Below is the code I wrote after that:

```python
# coding=utf-8
import io
import json
import os
import re
import sys
from datetime import datetime

import requests
from lxml import etree

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')


# Get the list of song ids; returns ids, song names, and full download links
def songs_lists(url=None):
    if url is None:
        url = 'https://music.163.com/#/playlist?id=2781114174'
    url = url.replace('/#', '').replace('https', 'http')  # drop '/#' and switch the protocol
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Referer': 'https://music.163.com/',
        'Host': 'music.163.com'
    }
    # Fetch the page source
    res = requests.get(url=url, headers=headers).text
    out_link = 'http://music.163.com/song/media/outer/url?id='
    tree = etree.HTML(res)
    # The song list
    song_list = tree.xpath('//ul[@class="f-hide"]/li/a')
    # In case this is an artist page
    artist_name_tree = tree.xpath('//h2[@id="artist-name"]/text()')
    artist_name = str(artist_name_tree[0]) if artist_name_tree else None

    # In case this is a playlist page:
    song_list_name_tree = tree.xpath('//h2[contains(@class,"f-ff2")]/text()')
    song_list_name = str(song_list_name_tree[0]) if song_list_name_tree else None

    song_links = []
    song_ids = []
    names = []
    for i, s in enumerate(song_list):
        href = str(s.xpath('./@href')[0])
        song_id = href.split('=')[-1]
        song_ids.append(song_id)
        src = out_link + song_id  # build the song's real src resource link
        song_links.append(src)
        title = str(s.xpath('./text()')[0])  # the song's name
        names.append(title)
    return [song_ids, names, song_links]


def download_song(lists):
    if lists[2]:
        print('Starting music download')
        download_location = './download_music'
        if not os.path.exists(download_location):
            os.mkdir(download_location)
        for n, song in enumerate(lists[2]):
            print('Downloading song {}: {}'.format(n, lists[1][n]))
            name = download_location + '/' + lists[1][n] + '.mp3'
            try:
                data = requests.get(song).content
                with open(name, 'wb') as f:
                    f.write(data)
            except Exception as e:
                print(e)
        print('All {} songs downloaded!'.format(len(lists[1])))


def download_comment(lists):
    if lists[2]:
        print('Starting comment download!')

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Referer': 'https://music.163.com/',
        'Host': 'music.163.com'
    }

    for n1, song_id in enumerate(lists[0]):
        link = 'https://music.163.com/api/v1/resource/comments/R_SO_4_' + song_id
        # NB: offset only advances by 2 per request here, so the 20-comment
        # pages overlap heavily -- fixed in the next version
        for p in range(1, 20, 2):
            url = link + '?offset=' + str(p)
            print(url)
            res = requests.get(url=url, headers=headers).text
            json_obj = json.loads(res)
            # Mind the data structure: a list of dicts, and the dicts contain lists
            comments = json_obj['comments']
            print('Downloading comments for song {}: {}'.format(n1, lists[1][n1]))

            download_location = './download_music'
            if not os.path.exists(download_location):
                os.mkdir(download_location)

            for i, n in enumerate(comments):
                comment = n['content']
                nickname = n['user']['nickname']
                time_update = n['time']
                # The timestamp is in milliseconds, hence the floor division
                ts = datetime.fromtimestamp(time_update // 1000)
                ts = ts.strftime('%Y-%m-%d %H:%M:%S')  # keep just date and time
                name_comment = download_location + '/' + lists[1][n1] + '.txt'
                try:
                    with open(name_comment, 'a+') as f:
                        data = '\t' + nickname + ':' + '\t' + ts + '\n' + comment + '\n'
                        f.write(data + '\n')
                except Exception as e:
                    print(e)
    print('Comments for all {} songs downloaded!'.format(len(lists[1])))


def download_lyrics(lists):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Referer': 'https://music.163.com/',
        'Host': 'music.163.com'
    }
    song_name, song_id = lists[1], lists[0]
    for i, sid in enumerate(song_id):
        url = 'http://music.163.com/api/song/lyric?id={}&lv=-1&kv=-1&tv=-1'.format(sid)
        print(url)
        # Fetch the lyric JSON
        res = requests.get(url=url, headers=headers).text
        json_obj = json.loads(res)
        lyric = json_obj['lrc']['lyric']
        reg = re.compile(r'\[.*\]')  # strip the [mm:ss.xx] time tags
        lrc_text = re.sub(reg, '', lyric).strip()
        # print(song_name, lrc_text)
        download_location = './download_music'
        if not os.path.exists(download_location):
            os.mkdir(download_location)
        name = download_location + '/' + song_name[i] + '.lrc'
        try:
            with open(name, 'w') as f:
                f.write(lrc_text)
        except Exception as e:
            print(e)


# try again
if __name__ == '__main__':
    print('start:\n')
    url = 'https://music.163.com/#/playlist?id=2384642500'
    lists = songs_lists(url)
    # Download the songs
    download_song(lists)
    # Download the comments
    download_comment(lists)
    # Download the lyrics
    download_lyrics(lists)
    print('end!')
```

In the end I still did the first step by hand: went to the browser myself and saved the playlist's HTML document locally.
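(In principle even this manual Ctrl+S step could be automated: have Selenium open the playlist, hop into the content frame, and dump page_source to mylist.html. A sketch under the same g_iframe assumption as above:)

```python
from selenium import webdriver

url = 'https://music.163.com/#/playlist?id=2781114174'
driver = webdriver.Chrome()
driver.get(url)
driver.switch_to.frame('g_iframe')   # assumed content frame, as before
with open('./mylist.html', 'w', encoding='utf-8') as f:
    f.write(driver.page_source)      # the frame's HTML, song table included
driver.quit()
```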

```python
# coding=utf-8
import io
import json
import os
import re
import sys
# import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from lxml import etree

# Switch the whole console's output encoding to gb18030
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')


# Get the list of song ids; returns ids, song names, and full download links
def songs_lists(url=None):
    if url is None:
        url = 'https://music.163.com/#/playlist?id=2781114174'
    url = url.replace('/#', '').replace('https', 'http')  # drop '/#' and switch the protocol
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Referer': 'https://music.163.com/',
        'Host': 'music.163.com'
    }
    # Fetch the page source
    res = requests.get(url=url, headers=headers).text

    # NetEase's API entry point for direct song links
    out_link = 'http://music.163.com/song/media/outer/url?id='

    tree = etree.HTML(res)
    # The song list
    song_list = tree.xpath('//ul[@class="f-hide"]/li/a')
    # In case this is an artist page
    artist_name_tree = tree.xpath('//h2[@id="artist-name"]/text()')
    artist_name = str(artist_name_tree[0]) if artist_name_tree else None

    # In case this is a playlist page:
    song_list_name_tree = tree.xpath('//h2[contains(@class,"f-ff2")]/text()')
    song_list_name = str(song_list_name_tree[0]) if song_list_name_tree else None

    # Collect the song ids, song names, and direct links
    song_ids, names, song_links = [], [], []
    for i, s in enumerate(song_list):
        href = str(s.xpath('./@href')[0])

        song_id = href.split('=')[-1]
        song_ids.append(song_id)

        src = out_link + song_id  # build the song's real src resource link
        song_links.append(src)

        title = str(s.xpath('./text()')[0])  # the song's name
        names.append(title)

    return [song_ids, names, song_links]


def download_song(lists, download_location):
    if lists[2]:
        print('Starting music download')

        # Create the folder if it does not exist yet
        if not os.path.exists(download_location):
            os.mkdir(download_location)

        for n, song in enumerate(lists[2]):
            print('Downloading song {}: {}'.format(n, lists[1][n]))
            name = download_location + '/' + lists[1][n] + '.mp3'
            try:
                data = requests.get(song).content
                # Songs are saved as binary data, hence 'wb'
                with open(name, 'wb') as f:
                    f.write(data)
            except Exception as e:
                print(e)
        print('All {} songs downloaded!'.format(len(lists[1])))


def download_comment(lists, download_location, pages=10):
    if lists[2]:
        print('Starting comment download!')

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Referer': 'https://music.163.com/',
        'Host': 'music.163.com'
    }

    for n1, song_id in enumerate(lists[0]):
        # The token-free comment API
        link = 'https://music.163.com/api/v1/resource/comments/R_SO_4_' + song_id
        for page in range(pages):
            # Advance offset by the page size (20) so pages don't overlap
            url = link + '?limit=20&offset=' + str(page * 20)
            print('Comment source link: ' + url)
            res = requests.get(url=url, headers=headers).text
            json_obj = json.loads(res)
            # print(type(json_obj))  # peek at the returned structure
            # Mind the data structure: a list of dicts, and the dicts contain lists
            comments = json_obj['comments']
            print('Downloading comments for song {}: {}'.format(n1, lists[1][n1]))

            if not os.path.exists(download_location):
                os.mkdir(download_location)

            for i, n in enumerate(comments):
                # Each comment is a dict
                comment = n['content']
                nickname = n['user']['nickname']
                time_update = n['time']
                # The timestamp oddly carries three extra digits (milliseconds), so floor-divide
                ts = datetime.fromtimestamp(time_update // 1000)
                ts = ts.strftime('%Y-%m-%d %H:%M:%S')  # keep just date and time
                name_comment = download_location + '/' + lists[1][n1] + '.txt'
                # Could also filter by a comment's like count here; didn't get around to it
                try:
                    with open(name_comment, 'a+', encoding='utf-8') as f:
                        data = '\t' + nickname + ':' + '\t' + ts + '\n' + comment + '\n'
                        f.write(data + '\n')
                except Exception as e:
                    print(e)
    print('The first {1} pages of comments for all {0} songs downloaded!'.format(len(lists[1]), pages))


def download_lyrics(lists, download_location):
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Referer': 'https://music.163.com/',
        'Host': 'music.163.com'
    }
    song_name, song_id = lists[1], lists[0]
    for i, sid in enumerate(song_id):
        url = 'http://music.163.com/api/song/lyric?id={}&lv=-1&kv=-1&tv=-1'.format(sid)
        print(url)
        # Fetch the lyric JSON
        try:
            res = requests.get(url=url, headers=headers).text
            json_obj = json.loads(res)
            lyric = json_obj['lrc']['lyric']
            reg = re.compile(r'\[.*\]')  # strip the [mm:ss.xx] time tags
            lrc_text = re.sub(reg, '', lyric).strip()
        except Exception as e:
            print(e)
            continue
        if not os.path.exists(download_location):
            os.mkdir(download_location)
        name = download_location + '/' + song_name[i] + '.lrc'
        try:
            with open(name, 'w', encoding='utf-8') as f:
                f.write(lrc_text.replace('\xa0', ''))
        except Exception as e:
            print(e)


# Reading from a locally saved HTML file; works much like the first version
def songs_lists_html(link):
    path = link  # the playlist page is dynamic; Ctrl+S only saves the outer page
    with open(path, encoding='utf-8') as file:
        context = file.read()

    soup = BeautifulSoup(context, "html.parser")
    song_links = []
    links = []
    song_ids = []
    names = []
    res = soup.select("tr")
    for li in res:
        title = li.find_all("b")
        link = li.find_all("a")
        try:
            new_link = r'http://music.163.com' + link[0].attrs['href']
            song_links.append(new_link)
        except (IndexError, KeyError):
            pass
        for i in title:
            out = i.attrs['title'].replace('\xa0', '')  # lots of \xa0 showed up; strip them
            names.append(out)
    for n in song_links:
        link = n.replace('/song?', '/song/media/outer/url?').replace('https', 'http')
        links.append(link)
        sid = n.replace('http://music.163.com/song?id=', '')
        song_ids.append(sid)
    # http://music.163.com/song/media/outer/url?id=xxxx
    return [song_ids, names, links]


# try again
if __name__ == '__main__':

    url = 'https://music.163.com/#/playlist?id=2781114174'
    # Local file
    link = r'./mylist.html'
    pages = 10
    # Get the ids, names, urls
    # lists = songs_lists(url)
    info = songs_lists_html(link)

    names = info[1][0:10]
    links = info[2][0:10]
    ids = info[0][0:10]
    lists = [ids, names, links]

    # Where to save everything
    download_location = r'./cloudmusic'
    # Download the songs
    download_song(lists, download_location)
    # Download the comments (some encoding problems remain, and scraping too fast
    # gets you forcibly cut off by the server -- sprinkle in time.sleep() if needed)
    download_comment(lists, download_location, pages)
    # Download the lyrics
    download_lyrics(lists, download_location)
```
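One loose end from the comment in main: the server cutting off fast scrapers. A polite-fetch helper along these lines could replace the bare requests.get calls (a sketch; the one-second delay and retry count are arbitrary choices):

```python
import time

import requests


def polite_get(url, headers=None, delay=1.0, retries=3):
    """requests.get with a fixed delay before each call and simple retries."""
    for attempt in range(retries):
        time.sleep(delay)  # don't hammer the server
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()
            return resp
        except requests.RequestException as e:
            print('attempt {} failed: {}'.format(attempt + 1, e))
    raise RuntimeError('giving up on ' + url)
```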


That's it for today.