简书爬虫

"""Jianshu crawler.

Asks for a number of pages, downloads that many pages of the Jianshu home
feed, and appends each article's title, abstract and link to
``./文章简介.txt``.
"""
import sys
from typing import Iterator, Tuple
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://www.jianshu.com"
OUTPUT_PATH = r"./文章简介.txt"
# Seconds to wait for the server; without a timeout a stalled connection
# would block the script forever.
REQUEST_TIMEOUT = 10

# Browser-like request headers, including the XHR / infinite-scroll markers.
# Built once because they are identical for every page.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0',
    'Accept': 'text/html, */*; q=0.01',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding': 'gzip, deflate',
    'Referer': 'https://www.jianshu.com/',
    'X-INFINITESCROLL': 'true',
    'X-Requested-With': 'XMLHttpRequest',
    'Connection': 'close',
}


def fetch_page(page: int) -> BeautifulSoup:
    """Download one page of the feed and return it as a parsed tree.

    Args:
        page: Page number placed in the ``page`` query parameter.

    Returns:
        The response body parsed with ``html.parser``.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    response = requests.get(
        url="{}/?page={}".format(BASE_URL, page),
        headers=HEADERS,
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    # Force UTF-8 rather than relying on the charset guessed from the
    # response headers, which may be missing (None) or wrong.
    response.encoding = "utf-8"
    return BeautifulSoup(response.text, "html.parser")


def iter_articles(soup: BeautifulSoup) -> Iterator[Tuple[str, str, str]]:
    """Yield ``(title, abstract, link)`` for the articles found in *soup*.

    Titles come from ``<a class="title">`` elements and abstracts from
    ``<p class="abstract">`` elements; the two lists are paired in document
    order, stopping at the shorter one.
    """
    titles = soup.find_all("a", class_="title")
    abstracts = soup.find_all("p", class_="abstract")
    for title, abstract in zip(titles, abstracts):
        # get_text() always returns a str, whereas .string is None when the
        # tag does not contain exactly one text node.
        href = title.get("href") or ""
        yield title.get_text(), abstract.get_text(), urljoin(BASE_URL, href)


def main() -> None:
    """Prompt for a page count, crawl those pages, and save the results.

    Raises:
        ValueError: If the entered page count is not an integer.
    """
    page_count = int(input("请输入想要抓取的页码数量:"))
    with open(OUTPUT_PATH, "a", encoding="utf-8") as out:
        # range() excludes its end, so +1 is required to fetch exactly
        # page_count pages (pages 1..page_count).
        for page in range(1, page_count + 1):
            try:
                soup = fetch_page(page)
            except requests.RequestException as exc:
                # Report the failure and move on so one bad page does not
                # abort the remaining pages.
                print(exc, file=sys.stderr)
                continue
            for title, abstract, link in iter_articles(soup):
                out.write(title + '\n')
                out.write(abstract + '\n')
                out.write(link + '\n\n')
    print("执行完毕,保存在目录:./文章简介.txt")


if __name__ == "__main__":
    main()

  • Copyright © 2018-2020 rich4rd
  • Visitors: | Views: