安装环境
Beautiful Soup
#打开cmd
pip install beautifulsoup4
requests
#打开cmd
pip install requests
pycharm安装环境
File菜单点击Settings,找到Python Interpreter
点击+号,搜索需要的库,下方Install Package
代码
GET方式抓取数据
import requests

# Minimal HTTP GET example: fetch the page and print the raw HTML body.
url = 'https://liu66.cc/'
res = requests.get(url)  # res.text is the decoded response body
print(res.text)
POST方式抓取数据
import requests  # HTTP client used to submit the form
import json


def get_translate_date(word=None):
    """POST *word* to the Youdao translate endpoint and print the English result.

    The form fields (salt/sign/ts/bv) were captured from a real browser
    session; the API may reject them once they go stale.
    """
    url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
    form_data = {
        'i': word,
        'from': 'zh-CHS',
        'to': 'en',
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        'salt': '15477056211258',
        'sign': 'b3589f32c38bc9e3876a570b8a992604',
        'ts': '1547705621125',
        'bv': 'b33a2f3f9d09bde064c9275bcb33d94e',
        'doctype': 'json',
        'version': '2.1',
        'keyfrom': 'fanyi.web',
        'action': 'FY_BY_REALTIME',
        'typoResult': 'false',
    }
    response = requests.post(url, data=form_data)  # submit the form data
    content = json.loads(response.text)  # parse the JSON reply into a dict
    # translateResult is a list of sentence groups; [0][0]['tgt'] is the
    # translation of the first sentence.
    print(content['translateResult'][0][0]['tgt'])


word = input("输入中文:")
print('英文结果:', end="")
get_translate_date(word)
input('Press Enter to exit…')
Beautiful Soup解析网页
import requests  # fetch the archives page
from bs4 import BeautifulSoup

url = 'https://blog.liustudy.xyz/archives'
strhtml = requests.get(url)
soup = BeautifulSoup(strhtml.text, 'lxml')  # parse with the lxml backend
# CSS selector copied verbatim from the browser's "Copy selector" feature;
# it points at the <a> elements of the archive list.
data = soup.select('#Joe > div.joe_container.joe_main_container.page-archives.animated.showInUp > div > div > div.content > div.joe_archives__wrapper.animated.fadeIn > ul > li > div > ol > li > a')
# Keep only each link's title attribute (None when the attribute is absent).
result = [item.get('title') for item in data]
for i in result:
    print(i)
Selenium
下载chromedriver安装包,注意chromedriver驱动版本要与电脑浏览器版本一致
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import time

url = 'https://blog.liustudy.xyz/'
# Selenium 4 removed the positional executable-path argument of
# webdriver.Chrome; the driver path must be wrapped in a Service object.
# The raw string also fixes the invalid '\s' escape in the Windows path.
driver = webdriver.Chrome(service=Service(r"D:\study\chromedriver.exe"))
driver.get(url)
driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")  # scroll to page bottom
# Click the "load more" button found via its copied XPath.
driver.find_element(By.XPATH, r'//*[@id="Joe"]/div[1]/div/div[2]').click()
time.sleep(5)  # give the JS-rendered list time to update the DOM
page_text = driver.page_source  # HTML after JS execution
soup = BeautifulSoup(page_text, 'lxml')
# CSS selector copied from DevTools; targets the article-title anchors.
data = soup.select('#Joe > div.joe_container.joe_main_container.page-index.animated.showInUp > div > div.joe_index > div.joe_index__article > div.joe_index__list > ul.joe_list > li > div > a.title')
result = [item.get('title') for item in data]
for i in result:
    print(i)
其他问题
爬取后数据乱码
print(res.encoding) # inspect the encoding requests guessed for res
# Re-decode the body: requests fell back to ISO-8859-1, but the page's
# declared charset is utf-8, so round-trip the bytes to recover the text.
r=res.text.encode('ISO-8859-1').decode('utf-8') # res was decoded as ISO-8859-1; page charset is utf-8
print(r)
网页打开开发者工具,刷新网页后在Network界面找到headers菜单,往下拉找到User-Agent
# Pretend to be a real browser: copy the User-Agent value shown in
# DevTools (Network tab → request Headers) so the server does not block us.
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
res = requests.get(url, headers=headers)
获取源代码不一致
requests获取到的是HTML源代码,但是网页用js渲染过,requests不会像浏览器那样运行后面请求来的js模块
评论区