Shared crawler code for the Baidu Tieba forums of several Nanjing schools - a simple tutorial (not for profit)

import requests
from lxml import etree
import time
import re
import json


# Data collection
class data_spider:
    def __init__(self):
        self.headers = {
            "Host": "tieba.baidu.com",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Cookie": 'BAIDUID=7C5F80F92A43BB0552547127F614C0DB:FG=1; BAIDUID_BFESS=7C5F80F92A43BB0552547127F614C0DB:FG=1; BAIDU_WISE_UID=wapp_1716602884591_936; arialoadData=false; BDUSS=Y5cTQyRnJWd2xFOWdYM2E5LU5IRUcxUjhmbU9xcHZLSEltWlItaW5UflIxWGhtRVFBQUFBJCQAAAAAAQAAAAEAAAC-sX0tAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANFIUWbRSFFmSU; BDUSS_BFESS=Y5cTQyRnJWd2xFOWdYM2E5LU5IRUcxUjhmbU9xcHZLSEltWlItaW5UflIxWGhtRVFBQUFBJCQAAAAAAQAAAAEAAAC-sX0tAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANFIUWbRSFFmSU; STOKEN=de20c67d13fb68350918d959c4405cbf1f30263ddad073ecbba301ee2a8dbbfc; NO_UNAME=1; ZFY=RSPxEJ8pBnVBq99zgBD8UqjinNcfrwYa2UzW6VU4pj4:C; wise_device=0; Hm_lvt_98b9d8c2fd6608d564bf2ac2ae642948=1716602884,1716605018; USER_JUMP=-1; BA_HECTOR=218gag050k04048ga0aga08l5cn6f71j52k2t1u; st_key_id=17; Hm_lpvt_98b9d8c2fd6608d564bf2ac2ae642948=1716605070; XFI=b5aa7110-1a40-11ef-b36f-b564a56959c0; ab_sr=1.0.1_ZGQwNjQxMTAyZjRmMTYyOTYxNmM3ZmQ1YjE1NDg0YmEyZjM1NzA5ZDA3YmZjZDMxYzhhMjhiOGQ4NmUzNjJiODFkMjhkN2M1NWU4ZjhlMmVlNDY2ZjIxMGVlYTAyYWQ2Njg0YWEyYzYxYmFhMTllYTEwNTgzNzMyMTg1ZjU3OGExNmYzNjEyNTgwOGUzMjU2NmVmZGM5MWZjNmM2ZWQ4ZWJmNzJkN2Q1ZDYyZTBiZDQ0NjZhN2JmNjVmMGRiZWIx; st_data=194567ccfbef90a0256950e978fbc04a88934d65aa5ead5e6682a994abe9a2a0941db906192496d0561160a400df5fd4d549b5f44a5a9a6b89fe5c1088dd4b33dd344ccd12ae68ee8172c95f985c933b0aed0081021977f52b758c82e5ef9228d1161086553ce87a9b70d76b26262a61947390f097c38c9e73295f97c9f5213099a828bbe4b1a59cb4d11bcb888e0e74; st_sign=1beeff43; XFCS=76481F3588960D951014668C5E0D1B684675B8B7E112D4AEC7171321D0129BB5; XFT=oC5w917R6661E8PescegEfT78DgIyhOe/YOQCVDeqf8=; RT="z=1&dm=baidu.com&si=71088702-f617-4002-9329-6bf26e2fc498&ss=lwlh19sm&sl=16&tt=13ac&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=1axaq&ul=1ecg7"',
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0"
}
self.tieba_items = []
self.saver = open('tieba_info.json', 'w', encoding='utf8')
self.tieba_name = None

    # Crawl one Tieba forum, starting from its first list page
    def spider_tieba(self, tieba_name, base_url):
        # Remember which forum (school) is being crawled
        self.tieba_name = tieba_name
        self.spider_tieba_list(base_url)

    # Normalize Tieba's abbreviated timestamps ("HH:MM", "YYYY-MM", "MM-DD")
    # into a full "YYYY-MM-DD HH:MM:SS" string
    def get_time_convert(self, timeStr):
        if re.match(r'^\d{1,2}:\d{1,2}$', timeStr) is not None:
            day = time.strftime('%Y-%m-%d', time.localtime(time.time()))
            timeStr = day + ' ' + timeStr + ':00'
        elif re.match(r'^\d{4}-\d{1,2}$', timeStr) is not None:
            day = time.strftime('%d', time.localtime(time.time()))
            timeStr = timeStr + '-' + day + ' 00:00:00'
        elif re.match(r'^\d{1,2}-\d{1,2}$', timeStr) is not None:
            day = time.strftime('%Y', time.localtime(time.time()))
            timeStr = day + '-' + timeStr + ' 00:00:00'
        return timeStr

    # Strip emoji (characters outside the Basic Multilingual Plane)
    def filter_emoji(self, desstr, restr=''):
        try:
            co = re.compile(u'[\U00010000-\U0010ffff]')
        except re.error:
            # Fallback for narrow Python builds, where such characters are stored as surrogate pairs
            co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
        return co.sub(restr, desstr)

    # Crawl one page of the Tieba thread list
    def spider_tieba_list(self, url):
        print(url)
        time.sleep(2)
        response = requests.get(url, headers=self.headers)
        try:
            response_txt = str(response.content, 'utf-8')
        except Exception:
            response_txt = str(response.content, 'gbk')

        # The thread list is delivered as an HTML comment inside a hidden <code> tag,
        # so pull that fragment out and strip the comment markers before parsing
        bs64_str = re.findall(
            r'<code class="pagelet_html" id="pagelet_html_frs-list/pagelet/thread_list" style="display:none;">[.\n\S\s]*?</code>',
            response_txt)
        bs64_str = ''.join(bs64_str).replace(
            '<code class="pagelet_html" id="pagelet_html_frs-list/pagelet/thread_list" style="display:none;"><!--', '')
        bs64_str = bs64_str.replace('--></code>', '')
        html = etree.HTML(bs64_str)

        # Thread titles
        title_list = html.xpath('//div[@class="threadlist_title pull_left j_th_tit "]/a[1]/@title')
        # Thread links
        link_list = html.xpath('//div[@class="threadlist_title pull_left j_th_tit "]/a[1]/@href')
        # Thread authors
        creator_list = html.xpath('//div[@class="threadlist_author pull_right"]/span[@class="tb_icon_author "]/@title')
        # Creation times
        create_time_list = html.xpath(
            '//div[@class="threadlist_author pull_right"]/span[@class="pull-right is_show_create_time"]/text()')

        for i in range(len(title_list)):
            item = dict()
            item['create_time'] = create_time_list[i]
            if item['create_time'] == '广告':  # skip advertisement rows
                continue
            item['create_time'] = self.get_time_convert(item['create_time'])
            item['title'] = self.filter_emoji(title_list[i])
            item['link'] = 'https://tieba.baidu.com' + link_list[i]
            if i < len(creator_list):  # guard against a shorter author list
                item['creator'] = self.filter_emoji(creator_list[i]).replace('主题作者: ', '')
            else:
                item['creator'] = '没名字'  # placeholder ("no name") when the author is missing
            item['content'] = self.filter_emoji(item['title'])
            item['school'] = self.tieba_name
            self.tieba_items.append(item)

        # Persist this batch as JSON Lines, then reset the in-memory buffer
        self.saver.writelines([json.dumps(item, ensure_ascii=False) + '\n' for item in self.tieba_items])
        self.saver.flush()
        self.tieba_items.clear()

        # Follow the next page if one exists
        nex_page = html.xpath('//a[@class="next pagination-item "]/@href')
        if len(nex_page) > 0:
            next_url = 'https:' + nex_page[0]

            # Stop once the "pn" offset in the next-page URL reaches 2000
            # (roughly 40 pages of 50 threads per forum)
            if float(next_url.split('=')[-1]) < 2000:
                try:
                    self.spider_tieba_list(next_url)
                    time.sleep(1)
                except Exception:
                    # Swallow errors on deeper pages so one bad page does not abort the run
                    pass


if __name__ == "__main__":
data_spider = data_spider()

school_urls = {
'南京邮电大学': 'https://tieba.baidu.com/f?kw=南京邮电大学&ie=utf-8',
'南京财经大学': 'https://tieba.baidu.com/f?kw=南京财经大学&ie=utf-8',
'南京审计大学': 'https://tieba.baidu.com/f?kw=南京审计大学&ie=utf-8',
'南京信息职业技术学院': 'https://tieba.baidu.com/f?kw=南京信息职业技术学院&ie=utf-8',
'南京工业职业技术大学': 'https://tieba.baidu.com/f?kw=南京工业职业技术大学&ie=utf-8',
'南京理工大学紫金学院': 'https://tieba.baidu.com/f?kw=南京理工大学紫金学院&ie=utf-8',
}

for school in school_urls:
print("==> 抓取 {} 的贴吧数据".format(school))
data_spider.spider_tieba(school, school_urls[school])
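
The crawler writes its results as JSON Lines: one JSON object per line in tieba_info.json, each with the fields title, link, creator, create_time, content and school. Below is a minimal sketch of how the file could be loaded back for further processing; it assumes the run above has finished and that tieba_info.json sits in the current working directory.

import json

# Read the crawler output: one JSON object per line (JSON Lines)
posts = []
with open('tieba_info.json', encoding='utf8') as f:
    for line in f:
        line = line.strip()
        if line:
            posts.append(json.loads(line))

print(len(posts), 'posts loaded')
# Each record carries the keys written by the spider above
if posts:
    print(posts[0]['school'], posts[0]['create_time'], posts[0]['title'])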