Scrapy learning case: scraping book info from 读书网 (dushu.com)

Author: 猴君

This case uses CrawlSpider.

1. Create the Scrapy project
scrapy startproject dushu 
2. Create the spider file (../dushu/dushu/spiders)
scrapy genspider -t crawl read https://www.dushu.com/book/1188_1.html 
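After these two commands the project has the standard Scrapy layout; read.py under spiders/ is the file generated by genspider:

dushu/
├── scrapy.cfg
└── dushu/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── read.py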

Scraping approach

  1. Define the crawl rules
rules = (Rule(LinkExtractor(allow=r"/book/1188_\d+\.html"), callback="parse_item", follow=True),) 
  2. Use XPath to parse the response and locate the outermost element wrapping each record (these selectors can be sanity-checked in scrapy shell; see the sketch after this list)
img_list = response.xpath('//div[@class="bookslist"]//img') 
  3. Loop over img_list and use XPath again to extract each book's data (name, src)
# alt holds the book title; data-original holds the lazily loaded cover URL
name = img.xpath('./@alt').extract_first()
src = img.xpath('./@data-original').extract_first()
  4. Instantiate the item class from items.py and yield the object to the pipeline
from dushu.items import DushuItem

book = DushuItem(name=name, src=src)
yield book
  5. Define the fields in items.py
name = scrapy.Field()
src = scrapy.Field()
  6. Set up the pipelines
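Before wiring up the pipelines, the link rule and the XPath expressions from the steps above can be checked interactively. A minimal scrapy shell session (a sketch; it assumes the dushu.com page structure described above):

scrapy shell https://www.dushu.com/book/1188_1.html

>>> response.xpath('//div[@class="bookslist"]//img')[:2]
>>> from scrapy.linkextractors import LinkExtractor
>>> LinkExtractor(allow=r"/book/1188_\d+\.html").extract_links(response)[:3]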

Storing the data in a MySQL database

  • Install MySQL beforehand

  • Create the database (mysql>):

# enter the mysql> prompt (from the terminal)
mysql -uroot -p<password>

# create the database
create database spider01 charset=utf8;

use spider01;

create table book(
    id int primary key auto_increment,
    name varchar(128),
    src varchar(128)
);

# check the book table
select * from book;

  • Imports (pipelines.py):

from scrapy.utils.project import get_project_settings
import pymysql

  • Constants (settings.py):

DB_HOST = "127.0.0.1"      # default host IP
DB_PORT = 3306             # note: this must be an integer
DB_USER = "root"
DB_PASSWORD = "<password>"
DB_NAME = "spider01"
DB_CHARSET = "utf8"

  • Add the pipeline that writes the data to the database (a parameterized variant of process_item is sketched after this list):

class MysqlPipeline:

    def open_spider(self, spider):
        settings = get_project_settings()

        self.host = settings['DB_HOST']
        self.port = settings['DB_PORT']
        self.user = settings['DB_USER']
        self.password = settings['DB_PASSWORD']
        self.name = settings['DB_NAME']
        self.charset = settings['DB_CHARSET']

        self.connect()

    def connect(self):
        self.conn = pymysql.connect(
            host=self.host,
            port=self.port,
            user=self.user,
            password=self.password,
            db=self.name,
            charset=self.charset
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        sql = 'insert into book(name, src) values ("{}","{}")'.format(item['name'], item['src'])
        self.cursor.execute(sql)
        self.conn.commit()
        return item

    def close_spider(self, spider):
        # close the cursor before the connection
        self.cursor.close()
        self.conn.close()

  • Uncomment and add the pipelines (settings.py):

ITEM_PIPELINES = {
   "dushu.pipelines.DushuPipeline": 300,
   "dushu.pipelines.MysqlPipeline": 301,
}
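The format-string INSERT above works for this demo, but it breaks if a book name happens to contain a double quote. As an alternative (not part of the original code), pymysql also accepts parameterized queries; a minimal sketch of process_item using %s placeholders:

    def process_item(self, item, spider):
        # let pymysql quote/escape the values instead of building the SQL string by hand
        sql = 'insert into book(name, src) values (%s, %s)'
        self.cursor.execute(sql, (item['name'], item['src']))
        self.conn.commit()
        return item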
3. Run the spider
scrapy crawl read 
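After the crawl finishes, you can confirm the rows landed in MySQL with select * from book; at the mysql> prompt, or with a short pymysql check like this sketch (the connection values mirror the DB_* settings above; fill in your own password):

import pymysql

# connection values mirror the DB_* constants in settings.py
conn = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                       password="<password>", db="spider01", charset="utf8")
cursor = conn.cursor()
cursor.execute("select count(*) from book")
print("rows:", cursor.fetchone()[0])
cursor.execute("select name, src from book limit 3")
for row in cursor.fetchall():
    print(row)
cursor.close()
conn.close()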

Code

  • ../dushu/dushu/spiders/read.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from dushu.items import DushuItem


class ReadSpider(CrawlSpider):
    name = "read"
    allowed_domains = ["www.dushu.com"]
    start_urls = ["https://www.dushu.com/book/1188_1.html"]

    rules = (Rule(LinkExtractor(allow=r"/book/1188_\d+\.html"), callback="parse_item", follow=True),)

    def parse_item(self, response):
        img_list = response.xpath('//div[@class="bookslist"]//img')

        for img in img_list:
            # alt holds the book title; data-original holds the lazily loaded cover URL
            name = img.xpath('./@alt').extract_first()
            src = img.xpath('./@data-original').extract_first()

            book = DushuItem(name=name, src=src)

            yield book
  • ../dushu/dushu/items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class DushuItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    src = scrapy.Field()
  • ../dushu/dushu/pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class DushuPipeline:
    def open_spider(self, spider):
        self.fp = open('book.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.fp.write(str(item))
        return item

    def close_spider(self, spider):
        self.fp.close()


from scrapy.utils.project import get_project_settings
import pymysql


class MysqlPipeline:

    def open_spider(self, spider):
        settings = get_project_settings()

        self.host = settings['DB_HOST']
        self.port = settings['DB_PORT']
        self.user = settings['DB_USER']
        self.password = settings['DB_PASSWORD']
        self.name = settings['DB_NAME']
        self.charset = settings['DB_CHARSET']

        self.connect()

    def connect(self):
        self.conn = pymysql.connect(
            host=self.host,
            port=self.port,
            user=self.user,
            password=self.password,
            db=self.name,
            charset=self.charset
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        sql = 'insert into book(name, src) values ("{}","{}")'.format(item['name'], item['src'])
        self.cursor.execute(sql)
        self.conn.commit()
        return item

    def close_spider(self, spider):
        # close the cursor before the connection
        self.cursor.close()
        self.conn.close()
  • ../dushu/dushu/settings.py
# Scrapy settings for dushu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "dushu"

SPIDER_MODULES = ["dushu.spiders"]
NEWSPIDER_MODULE = "dushu.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "dushu (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "dushu.middlewares.DushuSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "dushu.middlewares.DushuDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

DB_HOST = "127.0.0.1"
DB_PORT = 3306
DB_USER = "root"
DB_PASSWORD = "MySQL@4869"
DB_NAME = "spider01"
DB_CHARSET = "utf8"

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   "dushu.pipelines.DushuPipeline": 300,
   "dushu.pipelines.MysqlPipeline": 301,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
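As an aside: DushuPipeline writes str(item) objects back to back, so book.json is not strictly valid JSON. If all you need is a quick JSON dump, recent Scrapy versions can export a feed directly from the command line without a pipeline (standard Scrapy CLI, shown here only as an option); FEED_EXPORT_ENCODING = "utf-8" above keeps the Chinese titles readable in that file:

scrapy crawl read -O book.json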
