Web scraping: ChatGPT-generated code for scraping Baidu News


import requests
from bs4 import BeautifulSoup
import re

# Fetch the Baidu News front page
url = "https://news.baidu.com/"
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html, 'html.parser')

# Collect links that look like news articles: URLs containing a run of 10+ digits and ending in .html
news_list = soup.find_all('a', href=re.compile(r'^http(s)?://(news\.baidu\.com)?/.*\d{10,}.*\.html$'))

# Loop over the news links and scrape each article's title, URL, time, and body
for news in news_list:
    try:
        news_url = news['href']
        news_response = requests.get(news_url)
        news_html = news_response.content
        news_soup = BeautifulSoup(news_html, 'html.parser')

        # Article title
        news_title = news_soup.find('title').text

        # Publication time
        news_time = news_soup.find('span', class_='date').text

        # Article body: concatenate the text of all <p> tags
        news_content = ''
        for p in news_soup.find_all('p'):
            news_content += p.text.strip()

        print(news_title)
        print(news_url)
        print(news_time)
        print(news_content)
        print('\n')

    except Exception as e:
        print(e)
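
Note that the generated script assumes Baidu will serve pages to a bare requests call and that every article page contains a span with class "date". In practice it helps to send a browser-like User-Agent and set a timeout. Below is a minimal sketch of such a fetch helper; the fetch name, the header string, and the 10-second timeout are assumptions for illustration, not part of the ChatGPT output.

import requests

def fetch(url):
    # Hypothetical helper (assumed values): send a browser-like User-Agent so the
    # request is less likely to be rejected, and use a timeout so it cannot hang.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    return response.content

Replacing the two requests.get(...) calls in the script with fetch(...) leaves the rest of the code unchanged.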