Source code:
https://github.com/Wist-fully/Attack/tree/pc
pc_p1
Goals:
1. Start from the list page and follow it to every movie's detail page.
2. Use regular expressions to extract the poster, name, categories, release date, score, and plot summary.
3. Save the extracted content.
Logic:
1. Iterate over all the page numbers and build each list-page URL.
2. Collect the detail-page URLs from every list page.
3. On each detail page, match the content we want with regular expressions.
4. Extract the fields from the detail page with those patterns.
5. Save the data + optimize. (The whole flow is sketched right after this list.)
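Putting the five steps together, the crawl loop looks roughly like this (a sketch only; every function used here is defined step by step in the sections below):

for page in range(1, TOTAL_PAGE + 1):            # 1. iterate the page numbers
    index_html = scrape_index(page)              # fetch one list page
    for detail_url in parse_index(index_html):   # 2. collect the detail-page URLs
        detail_html = scrape_page(detail_url)    # 3. fetch the detail page
        data = parse_detail(detail_html)         # 4. regex extraction
        save_data(data)                          # 5. save the data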
First, import the required modules and set up logging:
#!/usr/bin/env python
import logging
import requests
import re
from urllib.parse import urljoin
import pymongo

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

BASE_URL = 'https://ssr1.scrape.center'
TOTAL_PAGE = 10
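With this configuration every record carries a timestamp, the level, and the message, e.g. (timestamp illustrative):

logging.info('scraping %s...', f'{BASE_URL}/page/1')
# 2024-05-01 10:00:00,123 - INFO - scraping https://ssr1.scrape.center/page/1...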
Core code
1. Iterate over all the page numbers and build the URLs
# scrape one list page
def scrape_index(page):
    index_url = f'{BASE_URL}/page/{page}'
    return scrape_page(index_url)
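A quick check of the URL template (output shown as a comment):

print(f'{BASE_URL}/page/2')  # https://ssr1.scrape.center/page/2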
2. Get the detail-page URLs
# define a function that fetches a page's HTML
def scrape_page(url):
    logging.info('scraping %s...', url)
    # issue a GET request
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        logging.error('scraping %s returned invalid status code %s',
                      url, response.status_code)
    except requests.RequestException:
        # log the exception if the request fails
        logging.error('error occurred while scraping %s', url, exc_info=True)
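Note that requests.get() is called without a timeout here, so a single stalled server can hang the whole crawl. A hardened variant could look like the sketch below (my own addition, not part of the original code; the User-Agent value is a placeholder):

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0'})  # placeholder UA

def scrape_page_safe(url, timeout=10):
    # same contract as scrape_page, but with a bounded wait
    # and a reused connection pool
    try:
        response = session.get(url, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx responses into exceptions
        return response.text
    except requests.RequestException:
        logging.error('error occurred while scraping %s', url, exc_info=True)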
3. Match the detail-page links out of each list page with a regex
def parse_index(html):
    # pull the detail links out with a regex
    pattern = re.compile('<a.*?href="(.*?)".*?class="name">')
    items = re.findall(pattern, html)
    if not items:
        return []
    for item in items:
        # turn the relative link into an absolute one
        detail_url = urljoin(BASE_URL, item)
        logging.info('found detail page, link %s', detail_url)
        yield detail_url

def main():
    for page in range(1, TOTAL_PAGE + 1):
        index_html = scrape_index(page)
        detail_urls = parse_index(index_html)
        logging.info('detail page links %s', list(detail_urls))

if __name__ == '__main__':
    main()
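Two details are easy to miss here. urljoin() resolves each relative href against BASE_URL, and parse_index() is a generator, so it can be consumed only once (example values illustrative):

print(urljoin(BASE_URL, '/detail/1'))  # https://ssr1.scrape.center/detail/1

detail_urls = parse_index(scrape_index(1))
print(list(detail_urls))  # the absolute detail links
print(list(detail_urls))  # [] - the generator is already exhausted

This is why main() above can either log list(detail_urls) or iterate it, but not both.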
4. Regex extraction on the detail page
Analysis:
1. The poster is an img tag; take its src URL.
2. The categories are span tags inside two button elements (see the pattern check after this list).
3. The release date is a span tag inside a div.
4. The score is a p tag with class score.
5. The plot summary is also a p tag, wrapped in a div with class drama.
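A quick sanity check of the categories pattern against a minimal hand-written snippet that mirrors the structure described above (the snippet is illustrative, not the real page):

snippet = ('<button class="category"><span>剧情</span></button>'
           '<button class="category"><span>爱情</span></button>')
print(re.findall('<button.*?category.*?<span>(.*?)</span>.*?</button>',
                 snippet, re.S))
# ['剧情', '爱情']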
def parse_detail(html):
    # match the poster URL
    cover_pattern = re.compile(
        'class="el-col.*?<img.*?src="(.*?)".*?class="cover">', re.S)
    # match the movie name
    name_pattern = re.compile('<h2.*?>(.*?)</h2>')
    # match the categories
    categories_pattern = re.compile(
        '<button.*?category.*?<span>(.*?)</span>.*?</button>', re.S)
    # match the release date
    published_at_pattern = re.compile(r'(\d{4}-\d{2}-\d{2})\s?上映')
    # match the plot summary
    drama_pattern = re.compile('<div.*?drama.*?>.*?<p.*?>(.*?)</p>', re.S)
    # match the score
    score_pattern = re.compile('<p.*?score.*?>(.*?)</p>', re.S)

    # search each pattern once and fall back to None/[] when it misses
    cover_match = re.search(cover_pattern, html)
    cover = cover_match.group(1).strip() if cover_match else None
    name_match = re.search(name_pattern, html)
    name = name_match.group(1).strip() if name_match else None
    categories = re.findall(categories_pattern, html)
    published_at_match = re.search(published_at_pattern, html)
    published_at = published_at_match.group(1) if published_at_match else None
    drama_match = re.search(drama_pattern, html)
    drama = drama_match.group(1).strip() if drama_match else None
    score_match = re.search(score_pattern, html)
    score = float(score_match.group(1).strip()) if score_match else None

    return {
        'cover': cover,
        'name': name,
        'categories': categories,
        'published_at': published_at,
        'drama': drama,
        'score': score,
    }
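As a small illustration, the release-date pattern isolates the date in front of the 上映 ("released") suffix, and the function returns one dict per movie (all values illustrative):

m = re.search(r'(\d{4}-\d{2}-\d{2})\s?上映', '1993-07-26 上映')
print(m.group(1))  # 1993-07-26

# shape of a parse_detail() result:
# {'cover': 'https://...jpg', 'name': '...', 'categories': ['剧情', '爱情'],
#  'published_at': '1993-07-26', 'drama': '...', 'score': 9.5}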
5. Save the data + optimization
#!/usr/bin/env python
import logging
import requests
import re
from urllib.parse import urljoin
import pymongo
import multiprocessing

mongo_client = pymongo.MongoClient("mongodb://192.168.6.6:27017/")
db = mongo_client["my_movies"]
collection = db["movies"]

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

BASE_URL = 'https://ssr1.scrape.center'
TOTAL_PAGE = 10

# scrape_index, scrape_page, parse_index and parse_detail are the functions
# defined in the sections above; fetching a detail page just reuses the
# generic fetcher
def scrape_detail(url):
    return scrape_page(url)

def save_data(data):
    collection.insert_one(data)
    logging.info('data saved to MongoDB!')

def main(page):
    index_html = scrape_index(page)
    detail_urls = parse_index(index_html)
    for detail_url in detail_urls:
        detail_html = scrape_detail(detail_url)
        data = parse_detail(detail_html)
        logging.info('got detail data %s', data)
        save_data(data=data)
        logging.info('data saved successfully')

def run_main(page):
    main(page)

if __name__ == '__main__':
    # number of CPU cores
    num_process = multiprocessing.cpu_count()
    # create a process pool
    pool = multiprocessing.Pool(num_process)
    # the pages to scrape
    pages_to_scrape = list(range(1, TOTAL_PAGE + 1))
    # map the pages over the pool
    pool.map(run_main, pages_to_scrape)
    # close the pool and wait for all workers to finish
    pool.close()
    pool.join()
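One caveat with this optimization: pymongo's MongoClient is not fork-safe, yet here it is created at import time, before multiprocessing.Pool forks its workers. A safer layout (my own sketch, not from the original post) opens one client per worker via a pool initializer:

collection = None  # set separately in each worker

def init_worker():
    # runs once inside every worker process, so each worker
    # owns its own MongoClient
    global collection
    client = pymongo.MongoClient("mongodb://192.168.6.6:27017/")
    collection = client["my_movies"]["movies"]

if __name__ == '__main__':
    with multiprocessing.Pool(multiprocessing.cpu_count(),
                              initializer=init_worker) as pool:
        pool.map(run_main, range(1, TOTAL_PAGE + 1))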