喜欢逛B站的同学都知道,B站最有意思的地方就是“弹幕”,视频的播放量和弹幕数量基本上成正比。分析一个视频最好的方法就是看它的弹幕情况。今天闲来无事,写一个爬取弹幕的程序玩玩,后面可以搭配其他库做词频分析,这里只写爬取弹幕的部分。

所需依赖库

# 请求库
requests
# 解析库
pyquery

资源页

# 获取视频cid
https://api.bilibili.com/x/player/pagelist?bvid=BV号
# 具体弹幕列表页
https://comment.bilibili.com/cid号.xml

具体代码

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# 爬取哔哩哔哩BV号对应视频的弹幕
import requests
from pyquery import PyQuery as pq

def get_url_page(url, timeout=10):
    """Fetch ``url`` with a browser-like User-Agent and return the response.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The ``requests.Response`` when the status code is 200, with its
        ``encoding`` set to the detected ``apparent_encoding``; ``None``
        for any other status code.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` on
            connection failure or timeout.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0",
        "Cookie": "",
    }
    # requests applies no timeout by default, so an unresponsive server
    # would otherwise hang the script indefinitely.
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return None
    # Decode the body using the encoding detected from its content rather
    # than the one derived from the HTTP headers.
    response.encoding = response.apparent_encoding
    return response

def get_cid(bv):
    """Return the cid of the first page of the video identified by ``bv``.

    Args:
        bv: BV id of the video; inserted into the pagelist API URL as-is.

    Returns:
        The ``cid`` value of the first entry in the response's ``data`` list.

    Raises:
        ValueError: If the pagelist request did not return HTTP 200, or the
            JSON response carries no page entries under ``data``.
    """
    cid_url = f"https://api.bilibili.com/x/player/pagelist?bvid={bv}"
    response = get_url_page(cid_url)
    # get_url_page returns None for any non-200 status; fail with a clear
    # message instead of an AttributeError on ``None.json()``.
    if response is None:
        raise ValueError(f"pagelist request failed for BV id {bv!r}")

    pages = response.json().get("data")
    # A missing or empty ``data`` field presumably means the BV id was not
    # recognised by the API -- TODO confirm against the API's error format.
    if not pages:
        raise ValueError(f"no page data returned for BV id {bv!r}")
    return pages[0]["cid"]

def main():
    """Prompt for a BV id and append that video's danmaku to a text file.

    Reads the BV id from standard input, resolves it to a cid via
    ``get_cid``, downloads ``https://comment.bilibili.com/<cid>.xml`` and
    appends the text of every ``<d>`` element, one per line, to
    ``danmaku_list.txt`` in the current working directory.

    Raises:
        SystemExit: If the danmaku XML could not be downloaded.
    """
    # Strip stray whitespace so a pasted id with a trailing space or newline
    # does not end up inside the request URL.
    bv_str = input("input the BV:").strip()
    danmaku_url = f"https://comment.bilibili.com/{get_cid(bv_str)}.xml"
    danmaku_page = get_url_page(danmaku_url)
    # get_url_page returns None on a non-200 status; exit with a readable
    # message rather than crashing on ``None.text``.
    if danmaku_page is None:
        raise SystemExit(f"failed to download danmaku list: {danmaku_url}")

    # The document is handed to PyQuery as UTF-8 bytes, not str.
    # NOTE(review): presumably because lxml rejects str input that carries
    # an XML encoding declaration -- confirm.
    doc = pq(bytes(danmaku_page.text, encoding="utf-8"))
    # Mode "a" appends, so repeated runs accumulate in the same file.
    with open("danmaku_list.txt", "a", encoding="utf-8") as f:
        f.writelines(f"{danmaku.text()}\n" for danmaku in doc("d").items())

# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()