Introduction
requests is one of the most popular HTTP libraries in Python. It greatly simplifies sending HTTP requests and is the go-to tool for web scraper development. This article walks through the main features of the requests library for scraping, along with practical tips.
1. requests Library Basics
1.1 Installing requests
pip install requests
1.2 Basic Import
import requests
2. HTTP Request Methods
2.1 GET Requests
import requests

response = requests.get('https://httpbin.org/get')
print(response.text)

params = {'key1': 'value1', 'key2': 'value2'}
response = requests.get('https://httpbin.org/get', params=params)
print(response.url)
2.2 POST Requests
import json

# Form-encoded POST
data = {'username': 'admin', 'password': '123456'}
response = requests.post('https://httpbin.org/post', data=data)

# JSON POST using the json parameter
json_data = {'name': 'Zhang San', 'age': 25}
response = requests.post('https://httpbin.org/post', json=json_data)

# Equivalent JSON POST with manually serialized data and an explicit Content-Type
response = requests.post('https://httpbin.org/post', data=json.dumps(json_data), headers={'Content-Type': 'application/json'})
2.3 Other HTTP Methods
response = requests.put('https://httpbin.org/put', data={'key': 'value'})
response = requests.delete('https://httpbin.org/delete')
response = requests.head('https://httpbin.org/get')
response = requests.options('https://httpbin.org/get')
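For completeness, requests also exposes a patch() helper for partial updates; a minimal sketch against httpbin:

# PATCH request (httpbin echoes the submitted form data back)
response = requests.patch('https://httpbin.org/patch', data={'key': 'new_value'})
print(response.json())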
3. Setting Request Headers
3.1 Setting the User-Agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
response = requests.get('https://httpbin.org/headers', headers=headers)
3.2 Common Request Headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Referer': 'https://www.google.com/'
}
4. Response Handling
4.1 Getting the Response Content
response = requests.get('https://httpbin.org/get')

# Text and raw binary content
print(response.text)
print(response.content)

# Parse JSON when the Content-Type indicates it
if 'application/json' in response.headers.get('Content-Type', ''):
    json_data = response.json()
    print(json_data)

# Inspect and override the text encoding
print(response.encoding)
response.encoding = 'utf-8'
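If a page comes back garbled because the server did not declare a charset, requests falls back to a default encoding. A minimal sketch of letting requests guess the encoding from the body instead (works on any text response):

response = requests.get('https://httpbin.org/get')
# apparent_encoding is detected from the response body itself
response.encoding = response.apparent_encoding
print(response.text)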
4.2 Response Status Codes
response = requests.get('https://httpbin.org/get')
print(response.status_code)

if response.status_code == 200:
    print('Request succeeded')
else:
    print(f'Request failed, status code: {response.status_code}')

# raise_for_status() raises an HTTPError for 4xx/5xx responses
try:
    response.raise_for_status()
except requests.exceptions.HTTPError as e:
    print(f'HTTP error: {e}')
4.3 Response Headers
response = requests.get('https://httpbin.org/get')
print(response.headers)
print(response.headers['Content-Type'])
print(response.headers.get('Server', 'unknown'))
5. Session Management
5.1 The Session Object
session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
})

response = session.get('https://httpbin.org/get')

# Cookies set by the server persist across requests in the same session
response = session.get('https://httpbin.org/cookies/set/sessioncookie/123456789')
response = session.get('https://httpbin.org/cookies')
print(response.json())
5.2 Cookie Handling
cookies = {'session_id': 'abc123', 'user_id': '456'}
response = requests.get('https://httpbin.org/cookies', cookies=cookies)

response = requests.get('https://httpbin.org/cookies/set/test/value')
print(response.cookies)

session = requests.Session()
response = session.get('https://httpbin.org/cookies/set/auto/managed')
response = session.get('https://httpbin.org/cookies')
print(response.json())
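For finer control over cookie scope, requests also accepts a cookie jar instead of a plain dict; a minimal sketch using RequestsCookieJar (the domain and path values here are illustrative):

jar = requests.cookies.RequestsCookieJar()
jar.set('session_id', 'abc123', domain='httpbin.org', path='/')
response = requests.get('https://httpbin.org/cookies', cookies=jar)
print(response.json())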
6. Proxy Settings
6.1 HTTP Proxies
proxies = {
    'http': 'http://proxy.example.com:8080',
    'https': 'https://proxy.example.com:8080'
}
response = requests.get('https://httpbin.org/ip', proxies=proxies)
6.2 SOCKS Proxies
# SOCKS support requires the optional dependency: pip install requests[socks]
proxies = {
    'http': 'socks5://127.0.0.1:1080',
    'https': 'socks5://127.0.0.1:1080'
}
response = requests.get('https://httpbin.org/ip', proxies=proxies)
7. Timeout Settings
# Separate connect and read timeouts: (connect, read)
try:
    response = requests.get('https://httpbin.org/delay/5', timeout=(3, 10))
except requests.exceptions.Timeout:
    print('Request timed out')

# A single value applies to both connect and read
try:
    response = requests.get('https://httpbin.org/delay/5', timeout=5)
except requests.exceptions.Timeout:
    print('Request timed out')
8. SSL Certificate Verification
# Disable certificate verification (not recommended for production)
response = requests.get('https://httpbin.org/get', verify=False)

# Verify against a custom CA bundle
response = requests.get('https://httpbin.org/get', verify='/path/to/ca-bundle.crt')

# Client-side certificate authentication
response = requests.get('https://httpbin.org/get', cert=('/path/to/client.cert', '/path/to/client.key'))
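Note that verify=False makes urllib3 print an InsecureRequestWarning for every request. If you have deliberately accepted that risk (for example against an internal test server), the warning can be silenced; a minimal sketch:

import urllib3

# Suppress the warning that verify=False would otherwise trigger
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
response = requests.get('https://httpbin.org/get', verify=False)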
9. File Upload and Download
9.1 File Upload
with open('test.txt', 'rb') as f:
    files = {'file': f}
    response = requests.post('https://httpbin.org/post', files=files)

files = {
    'file1': open('file1.txt', 'rb'),
    'file2': open('file2.txt', 'rb')
}
response = requests.post('https://httpbin.org/post', files=files)
for file in files.values():
    file.close()
9.2 File Download
# Small file: write the whole body at once
response = requests.get('https://httpbin.org/image/png')
with open('image.png', 'wb') as f:
    f.write(response.content)

# Large file: stream the body in chunks to avoid loading it all into memory
url = 'https://httpbin.org/stream-bytes/1024'
response = requests.get(url, stream=True)
with open('large_file.bin', 'wb') as f:
    for chunk in response.iter_content(chunk_size=8192):
        if chunk:
            f.write(chunk)
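If you want a rough progress readout during a streamed download, the Content-Length header (when the server provides one) gives the total size; a minimal sketch, using an httpbin endpoint as a stand-in for a real file URL:

url = 'https://httpbin.org/bytes/102400'
response = requests.get(url, stream=True)
total = int(response.headers.get('Content-Length', 0))
downloaded = 0
with open('download.bin', 'wb') as f:
    for chunk in response.iter_content(chunk_size=8192):
        if chunk:
            f.write(chunk)
            downloaded += len(chunk)
            if total:
                print(f'Progress: {downloaded / total:.0%}')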
10. Exception Handling
import requests
from requests.exceptions import RequestException, ConnectionError, Timeout, HTTPError

def safe_request(url, **kwargs):
    try:
        response = requests.get(url, **kwargs)
        response.raise_for_status()
        return response
    except ConnectionError:
        print('Connection error')
    except Timeout:
        print('Request timed out')
    except HTTPError as e:
        print(f'HTTP error: {e}')
    except RequestException as e:
        print(f'Request exception: {e}')
    return None

response = safe_request('https://httpbin.org/get', timeout=5)
if response:
    print(response.text)
11. Retry Mechanism
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def create_session_with_retry():
    session = requests.Session()
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

session = create_session_with_retry()
# Once the retries are exhausted, requests raises requests.exceptions.RetryError
response = session.get('https://httpbin.org/status/500')
12. Practical Scraper Examples
12.1 Scraping Web Page Content
import requests
from bs4 import BeautifulSoup
import time
import random

class WebScraper:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def get_page(self, url, **kwargs):
        try:
            response = self.session.get(url, timeout=10, **kwargs)
            response.raise_for_status()
            return response
        except Exception as e:
            print(f'Failed to fetch page: {e}')
            return None

    def parse_html(self, html_content):
        soup = BeautifulSoup(html_content, 'html.parser')
        return soup

    def crawl_with_delay(self, urls):
        results = []
        for url in urls:
            response = self.get_page(url)
            if response:
                results.append(response.text)
            time.sleep(random.uniform(1, 3))
        return results

scraper = WebScraper()
urls = ['https://httpbin.org/get', 'https://httpbin.org/headers']
results = scraper.crawl_with_delay(urls)
12.2 Handling Form Login
def login_example():
    session = requests.Session()

    # Fetch the login page to extract the CSRF token
    login_page = session.get('https://example.com/login')
    soup = BeautifulSoup(login_page.text, 'html.parser')
    csrf_token = soup.find('input', {'name': 'csrf_token'})['value']

    login_data = {
        'username': 'your_username',
        'password': 'your_password',
        'csrf_token': csrf_token
    }
    response = session.post('https://example.com/login', data=login_data)

    if 'Login successful' in response.text or response.url == 'https://example.com/dashboard':
        print('Login succeeded')
        return session
    else:
        print('Login failed')
        return None
13. Performance Optimization Tips
13.1 Connection Pooling
import requests
from requests.adapters import HTTPAdapter

session = requests.Session()
adapter = HTTPAdapter(
    pool_connections=10,
    pool_maxsize=20,
    max_retries=3
)
session.mount('http://', adapter)
session.mount('https://', adapter)
13.2 Concurrent Requests
import concurrent.futures
import requests

def fetch_url(url):
    try:
        response = requests.get(url, timeout=5)
        return response.status_code, len(response.content)
    except Exception as e:
        return None, str(e)

urls = ['https://httpbin.org/get'] * 10

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    results = list(executor.map(fetch_url, urls))
    for i, (status, size) in enumerate(results):
        print(f'URL {i}: Status={status}, Size={size}')
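A Session is not documented as safe to share across threads, so a common compromise is to give each worker thread its own session via thread-local storage; a minimal sketch of that pattern:

import concurrent.futures
import threading
import requests

thread_local = threading.local()

def get_session():
    # Lazily create one Session per worker thread
    if not hasattr(thread_local, 'session'):
        thread_local.session = requests.Session()
    return thread_local.session

def fetch_with_session(url):
    response = get_session().get(url, timeout=5)
    return response.status_code

urls = ['https://httpbin.org/get'] * 10
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    print(list(executor.map(fetch_with_session, urls)))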
14. Anti-Scraping Countermeasures
14.1 Random User-Agent
import random
import requests

user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
]

def get_random_headers():
    return {
        'User-Agent': random.choice(user_agents),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
    }

response = requests.get('https://httpbin.org/headers', headers=get_random_headers())
14.2 IP Proxy Pool
import random
import requests

class ProxyPool:
    def __init__(self):
        self.proxies = [
            {'http': 'http://proxy1:8080', 'https': 'https://proxy1:8080'},
            {'http': 'http://proxy2:8080', 'https': 'https://proxy2:8080'},
        ]

    def get_random_proxy(self):
        return random.choice(self.proxies)

    def request_with_proxy(self, url, **kwargs):
        proxy = self.get_random_proxy()
        try:
            response = requests.get(url, proxies=proxy, timeout=10, **kwargs)
            return response
        except Exception as e:
            print(f'Proxy request failed: {e}')
            return None

proxy_pool = ProxyPool()
response = proxy_pool.request_with_proxy('https://httpbin.org/ip')
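Public proxies fail often, so it is worth validating them before relying on the pool; a minimal sketch that keeps only proxies which can reach httpbin (the proxy addresses above are placeholders):

def check_proxy(proxy):
    # A proxy is considered usable if it can fetch our IP-echo endpoint
    try:
        response = requests.get('https://httpbin.org/ip', proxies=proxy, timeout=5)
        return response.status_code == 200
    except requests.exceptions.RequestException:
        return False

working_proxies = [p for p in proxy_pool.proxies if check_proxy(p)]
print(f'{len(working_proxies)} of {len(proxy_pool.proxies)} proxies are usable')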
15. Best Practices
15.1 A Complete Scraper Framework
import requests
import time
import random
import logging
from urllib.parse import urljoin, urlparse

class AdvancedScraper:
    def __init__(self, base_url=None, delay_range=(1, 3)):
        self.base_url = base_url
        self.delay_range = delay_range
        self.session = self._create_session()
        self.logger = self._setup_logger()

    def _create_session(self):
        session = requests.Session()
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        return session

    def _setup_logger(self):
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.INFO)
        handler = logging.StreamHandler()
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        return logger

    def get(self, url, **kwargs):
        if self.base_url and not url.startswith('http'):
            url = urljoin(self.base_url, url)
        try:
            self.logger.info(f'Requesting URL: {url}')
            response = self.session.get(url, timeout=10, **kwargs)
            response.raise_for_status()
            delay = random.uniform(*self.delay_range)
            time.sleep(delay)
            return response
        except Exception as e:
            self.logger.error(f'Request failed: {url}, error: {e}')
            return None

    def close(self):
        self.session.close()

scraper = AdvancedScraper(base_url='https://httpbin.org')
response = scraper.get('/get')
if response:
    print(response.json())
scraper.close()
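If you would rather not call close() by hand, the scraper can be given context-manager support so the session is released automatically; a minimal sketch extending the class above (ManagedScraper is a name introduced here for illustration):

class ManagedScraper(AdvancedScraper):
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Always release the underlying session, even on errors
        self.close()
        return False

with ManagedScraper(base_url='https://httpbin.org') as scraper:
    response = scraper.get('/get')
    if response:
        print(response.json())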
Summary
The requests library is a powerful tool for Python scraper development. The key points to master are:
Basic usage: be fluent with GET, POST, and the other HTTP methods
Session management: use Session objects to manage cookies and connections
Exception handling: build a thorough error-handling mechanism
Performance optimization: connection pooling, concurrent requests, and related techniques
Anti-scraping countermeasures: random request headers, proxy pools, request delays, and so on
Best practices: structured code, logging, resource management
By applying these techniques sensibly, you can build stable and efficient web scrapers. Remember to respect each site's robots.txt and the applicable laws and regulations, and collect data responsibly.
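As a concrete way to honor robots.txt, the standard library's urllib.robotparser can check whether a URL may be fetched before you request it; a minimal sketch (the user-agent string is a placeholder):

from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url('https://httpbin.org/robots.txt')
rp.read()
if rp.can_fetch('MyScraperBot/1.0', 'https://httpbin.org/get'):
    print('Allowed by robots.txt')
else:
    print('Disallowed by robots.txt')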