Crawling Sina Weibo with Python, and Subsequent Processing

  • Crawl data from m.weibo.cn using a logged-in cookie

Crawling the list data

import requests

headers = {
    'Cookie': '***',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
    'x-xsrf-token': '2c1c0a'
}

request_url = 'https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3D%E5%AD%99%E6%9D%A8%E5%9B%9E%E5%BA%94%E9%81%AD%E7%A6%81%E8%B5%9B8%E5%B9%B4&page_type=searchall&page=5'

response = requests.get(request_url, headers=headers)
response.encoding = 'utf-8'

# Decode the unicode escapes before saving, so the Chinese text is directly readable in the file
text = response.text.encode('utf-8').decode('unicode-escape')

with open('weibo.txt', 'w', encoding='utf-8') as f:
    f.write(text)

Weibo returns JSON, but the unicode-escape transcoding above corrupts the JSON structure, which makes further processing awkward. The better approach is to keep the response in its unicode-escaped form, extract the fields first, and only convert them to Chinese afterwards: simply set text = response.text.
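A minimal sketch of the corrected save step, reusing request_url and headers from the script above. The json.dump alternative with ensure_ascii=False is just one optional way to keep readable Chinese in the file without breaking the JSON structure.

import json       # only needed for the json.dump alternative below
import requests

response = requests.get(request_url, headers=headers)  # request_url / headers as defined above
text = response.text  # keep the raw JSON string; no unicode-escape decoding

with open('weibo.txt', 'w', encoding='utf-8') as f:
    f.write(text)

# Alternative: re-serialize the parsed JSON so the saved file shows Chinese
# characters directly while the structure stays intact
# with open('weibo.txt', 'w', encoding='utf-8') as f:
#     json.dump(response.json(), f, ensure_ascii=False)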

Extracting the list data

# -*- coding: utf-8 -*-

import json
from bs4 import BeautifulSoup

# Read the saved file and extract the fields
with open('weibo.txt', 'r', encoding='utf-8') as f:
    json_text = json.load(f)

cards = json_text['data']['cards']
for card in cards:
    if 'mblog' in card:
        mblog = card['mblog']
        print('post ID', mblog['id'])  # https://m.weibo.cn/detail/4477074850969438
        print('screen name', mblog['user']['screen_name'])
        print('user ID', mblog['user']['id'])  # https://m.weibo.cn/u/5288553158
        print('gender', mblog['user']['gender'])
        print('followers', mblog['user']['followers_count'])
        print('following', mblog['user']['follow_count'])
        if 'verified_reason' in mblog['user']:
            print('verified reason', mblog['user']['verified_reason'])
        print('member rank', mblog['user']['mbrank'])
        print('source', mblog['source'])
        print('reposts', mblog['reposts_count'])
        print('comments', mblog['comments_count'])
        print('likes', mblog['attitudes_count'])
        if 'longText' in mblog:
            # Long posts keep their full text in longText.longTextContent
            longText = mblog['longText']
            longTextContent = longText['longTextContent']
            bs = BeautifulSoup(longTextContent, 'lxml')
            print(bs.get_text())
        else:
            text = mblog['text']
            bs = BeautifulSoup(text, 'lxml')
            print(bs.get_text())

        if 'retweeted_status' in mblog:
            print('this post is a repost')
            text = mblog['retweeted_status']['text']
            bs = BeautifulSoup(text, 'lxml')
            print(bs.get_text())
            print('ID', mblog['retweeted_status']['user']['id'])  # https://m.weibo.cn/u/5288553158
            print('gender', mblog['retweeted_status']['user']['gender'])
            print('followers', mblog['retweeted_status']['user']['followers_count'])
            print('following', mblog['retweeted_status']['user']['follow_count'])
        print()
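For the subsequent processing mentioned in the title, it can be more convenient to collect the fields into a flat table instead of printing them. Below is a minimal sketch that writes the per-post fields to a CSV file; the column names and the weibo_list.csv filename are illustrative choices, not part of the original script.

# -*- coding: utf-8 -*-

import csv
import json
from bs4 import BeautifulSoup

with open('weibo.txt', 'r', encoding='utf-8') as f:
    cards = json.load(f)['data']['cards']

rows = []
for card in cards:
    if 'mblog' not in card:
        continue
    mblog = card['mblog']
    # Prefer the full text of long posts when it is present
    html = mblog['longText']['longTextContent'] if 'longText' in mblog else mblog['text']
    rows.append({
        'post_id': mblog['id'],
        'screen_name': mblog['user']['screen_name'],
        'followers': mblog['user']['followers_count'],
        'reposts': mblog['reposts_count'],
        'comments': mblog['comments_count'],
        'likes': mblog['attitudes_count'],
        'text': BeautifulSoup(html, 'lxml').get_text(),
    })

if rows:
    with open('weibo_list.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
        writer.writeheader()
        writer.writerows(rows)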

Crawling post comments