scrapy汽车之家车型的简单爬取

代码漫游号
• 阅读 3549

汽车之家车型的简单爬取
spider

# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from mininova.items import carItem
import sys
# NOTE(review): Python 2-only hack to force a UTF-8 default encoding.
# `reload` (as a builtin) and `sys.setdefaultencoding` do not exist in
# Python 3 — drop these two lines when porting.
reload(sys)
sys.setdefaultencoding('utf8')
class SplashSpider(scrapy.Spider):
    """Crawl autohome.com.cn car listing pages (one page per initial
    letter A-Z) and extract every brand / series / car model on them."""
    # Spider name used by `scrapy crawl car_home`.
    name = 'car_home'
    allowed_domains = ['autohome.com.cn']
    start_urls = [
    ]
    # Per-spider settings: route scraped items through CarPipeline.
    custom_settings = {
        'ITEM_PIPELINES': {
            'mininova.pipelines.CarPipeline': 300,
        }
    }

    def start_requests(self):
        """Yield one request per initial-letter listing page.

        Bug fix: the original appended all URLs in one loop and issued
        the requests in a second loop that still referenced the loop
        variable `word`, so every request carried meta={'word': 'Z'}.
        Yielding inside a single loop pairs each URL with its own letter.
        """
        # All possible initial letters of brand names.
        words = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
                 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
        for word in words:
            url = 'https://www.autohome.com.cn/grade/carhtml/' + word + '.html'
            self.start_urls.append(url)  # kept for backward compatibility
            yield Request(url, meta={'word': word})

    def parse(self, response):
        """Parse one letter page and return a list of carItem.

        Page structure (as queried below): each <dl> is a brand; inside
        it each div.h3-tit anchor is a series ("brand item") and each
        ul.rank-list-ul holds that series' concrete car models.
        """
        print('url')
        print(response.url)
        word = response.meta['word']
        total_cars = []
        brand_count = len(response.xpath('//dl').extract())
        for brand_index in range(brand_count):
            # XPath positions are 1-based.
            brand_num = str(brand_index + 1)
            brand_xp = '//dl[' + brand_num + ']'
            # Brand name and logo.
            brand = response.xpath(brand_xp + '/dt/div[1]/a/text()').extract()[0]
            print('brand:' + brand)
            brand_logo_url = response.xpath(brand_xp + '/dt//img[1]/@src').extract()[0]
            # Series names and their detail-page URLs, in matching order.
            brand_items = response.xpath(brand_xp + '/dd//div[@class="h3-tit"]/a/text()').extract()
            brand_item_urls = response.xpath(brand_xp + '/dd//div[@class="h3-tit"]/a/@href').extract()
            for brand_item_index, brand_item in enumerate(brand_items):
                brand_item_num = str(brand_item_index + 1)
                brand_item_url = brand_item_urls[brand_item_index]
                print('brand_item:' + brand_item)
                print('brand_item_url:' + brand_item_url)
                # All cars listed under this series.
                car_list_xp = (brand_xp + '/dd//ul[@class="rank-list-ul"][' +
                               brand_item_num + ']/li[@id]')
                cars = response.xpath(car_list_xp).extract()
                print('cars_count:' + str(len(cars)))
                for car_index in range(len(cars)):
                    car_xp = car_list_xp + '[' + str(car_index + 1) + ']'
                    # Concrete car name and its detail-page URL.
                    name = response.xpath(car_xp + '/h4/a/text()').extract()[0]
                    url = response.xpath(car_xp + '/h4/a/@href').extract()[0]
                    # Quoted price range, e.g. "10.00-15.00万".
                    price = response.xpath(car_xp + '/div[1]/a/text()').extract()[0]
                    prices = price.split('-')
                    price_base = '万'
                    if len(prices) != 2:
                        # No valid min-max range quoted on the page.
                        max_price = '暂无'
                        min_price = '暂无'
                    else:
                        max_price = str(prices[1].replace(price_base, ''))
                        min_price = str(prices[0])
                    print('car:' + name + ' max_price:' + str(max_price) +
                          ' min_price:' + str(min_price) + ' price_base:' + price_base)
                    car_item = carItem()
                    car_item['name'] = name
                    car_item['url'] = url
                    car_item['brand_item'] = brand_item
                    car_item['first_word'] = word
                    car_item['brand'] = brand
                    car_item['brand_logo_url'] = brand_logo_url
                    car_item['max_price'] = max_price
                    car_item['min_price'] = min_price
                    total_cars.append(car_item)
        return total_cars

item

# -*- coding: utf-8 -*-
import scrapy
class carItem(scrapy.Item):
    """One concrete car model scraped from autohome.com.cn, together
    with its brand / series context."""
    # Concrete car model name.
    name = scrapy.Field()
    # URL of the car's detail page.
    url = scrapy.Field()
    # Highest quoted price, unit: 10,000 CNY (万).
    max_price = scrapy.Field()
    # Lowest quoted price, unit: 10,000 CNY (万).
    min_price = scrapy.Field()
    # Brand name.
    brand = scrapy.Field()
    # Brand logo image URL.
    brand_logo_url = scrapy.Field()
    # Series (brand sub-category) name.
    brand_item = scrapy.Field()
    # First letter of the brand name (A-Z).
    first_word = scrapy.Field() 

mongo_car

from mininova.mongodb import Mongo
from mininova.settings import mongo_setting
class MongoCar():
    db_name = 'car'
    brand_set_name = 'brand'
    brand_item_set_name = 'brand_item'
    car_set_name = 'car'
    def __init__(self):
        self.db = Mongo(mongo_setting['mongo_host'],mongo_setting['mongo_port'],mongo_setting['mongo_user'],mongo_setting['mongo_password'])

    def insert(self,item):
        brand_where = {'name':item['brand']}
        brand = self.brand_exist(self.db,brand_where)
        if brand == False:
            brand = {'name':item['brand'],'first_word':item['first_word']}
            brand = self.insert_brand(self.db,brand)
            print('brand insert ok!')
        else:
            brand = {'name':item['brand'],'first_word':item['first_word'],'logo_url':item['brand_logo_url']}
            brand = self.update_brand(self.db,brand_where,brand)
            print('brand_exist!')

        brand_item_where = {'name':item['brand_item']}
        brand_item = self.brand_item_exist(self.db,brand_item_where)
        if brand_item == False:
            brand_item = {'name':item['brand_item'],'first_word':item['first_word'],'brand_id':brand['_id']}
            brand_item = self.insert_brand_item(self.db,brand_item)
            print('brand_item insert ok!')
        else:
            print('brand_item_exist!')

        car_where = {'name':item['brand_item'],'name':item['name']}
        car = self.car_exist(self.db,car_where)
        if car == False:
            car = {'name':item['name'],'url':item['url'],'max_price':item['max_price'],'min_price':item['min_price'],'first_word':item['first_word'],'brand_id':brand['_id'],'brand_item_id':brand_item['_id']}
            car = self.insert_car(self.db,car)
            print('car insert ok!')
        else:
            print('car_exist!')
            


        if car != False:
            return True;
        else:
            return False;
    def update_brand(self,db,brand_where,brand):
        my_set = db.set(self.db_name,self.brand_set_name)
        my_set.update_one(brand_where,{'$set':brand})
        exist = my_set.find_one(brand_where)
        if(exist is None):
            return False
        else:
            return exist

    def brand_exist(self,db,brand):
        my_set = db.set(self.db_name,self.brand_set_name)
        exist = my_set.find_one(brand)
        if(exist is None):
            return False
        else:
            return exist

    def insert_brand(self,db,brand):
        my_set = db.set(self.db_name,self.brand_set_name)
        my_set.insert_one(brand)
        brand = my_set.find_one(brand)
        return brand

    def brand_item_exist(self,db,brand_item):
        my_set = db.set(self.db_name,self.brand_item_set_name)
        exist = my_set.find_one(brand_item)
        if(exist is None):
            return False
        else:
            return exist

    def insert_brand_item(self,db,brand_item):
        my_set = db.set(self.db_name,self.brand_item_set_name)
        my_set.insert_one(brand_item)
        brand = my_set.find_one(brand_item)
        return brand

    def car_exist(self,db,car):
        my_set = db.set(self.db_name,self.car_set_name)
        exist = my_set.find_one(car)
        if(exist is None):
            return False
        else:
            return exist

    def insert_car(self,db,car):
        my_set = db.set(self.db_name,self.car_set_name)
        my_set.insert_one(car)
        brand = my_set.find_one(car)
        return brand

pipeline

from mininova.settings import settings
import pymysql
import os
from mininova.db import Bookdb
from mininova.mongo_novel import MongoNovel
from mininova.mongo_car import MongoCar
import copy
class CarPipeline(object):
    """Scrapy item pipeline that stores each scraped car in MongoDB."""

    def process_item(self, item, spider):
        """Persist `item` via MongoCar and pass it on.

        NOTE(review): a new MongoCar (and thus a new DB connection) is
        created per item, mirroring the original behaviour; consider
        moving it to open_spider if throughput matters.
        """
        mongo_car = MongoCar()
        mongo_car.insert(item)
        print(item['name'])
        print('item insert ok!')
        # Bug fix: a pipeline must return the item (or raise DropItem) so
        # later pipelines in ITEM_PIPELINES still receive it; the original
        # implicitly returned None, silently dropping the item downstream.
        return item

setting

# MongoDB connection settings consumed by mininova.mongo_car.MongoCar.
# Replace the placeholder host and credentials with real values.
mongo_setting = {
    'mongo_host' : 'xxx.xxx.xxx.xxx',
    'mongo_port' : 27017,
    'mongo_user' : 'username',
    'mongo_password' : 'password'
}
点赞
收藏
评论区
推荐文章
blmius blmius
4年前
MySQL:[Err] 1292 - Incorrect datetime value: ‘0000-00-00 00:00:00‘ for column ‘CREATE_TIME‘ at row 1
文章目录问题用navicat导入数据时,报错:原因这是因为当前的MySQL不支持datetime为0的情况。解决修改sql\mode:sql\mode:SQLMode定义了MySQL应支持的SQL语法、数据校验等,这样可以更容易地在不同的环境中使用MySQL。全局s
美凌格栋栋酱 美凌格栋栋酱
7个月前
Oracle 分组与拼接字符串同时使用
SELECTT.,ROWNUMIDFROM(SELECTT.EMPLID,T.NAME,T.BU,T.REALDEPART,T.FORMATDATE,SUM(T.S0)S0,MAX(UPDATETIME)CREATETIME,LISTAGG(TOCHAR(
待兔 待兔
1年前
手写Java HashMap源码
HashMap的使用教程HashMap的使用教程HashMap的使用教程HashMap的使用教程HashMap的使用教程22
python爬虫增加多线程获取数据
Python爬虫应用领域广泛,并且在数据爬取领域处于霸主位置,并且拥有很多性能好的框架,像Scrapy、Request、BeautifuSoap、urlib等框架可以实现爬行自如的功能,只要有能爬取的数据,Python爬虫均可实现。数据信息采集离不开Pyt
Stella981 Stella981
3年前
Crawlscrapy分布式爬虫
1.概念:多台机器上可以执行同一个爬虫程序,实现网站数据的分布爬取 2.原生的scrapy是不可以实现分布式爬虫:a)调度器无法共享 b)管道无法共享 3.scrapy-redis组件:专门为scrapy开发的一套组件,该组件可以让scrapy实现分布式 a) pip install scrapy-redis 4.分布式爬取的流程:
Stella981 Stella981
3年前
Scrapy框架
\TOC\1\.Scrapy介绍1.1.Scrapy框架Scrapy是用纯Python实现一个为了爬取网站数据、提取结构性数据而编写的应用框架,用途非常广泛。!(http://ccszt.com.cn/python/%E7%88%AC%E8%99%AB/file/images/
Stella981 Stella981
3年前
Scrapy爬虫框架
(1)、简介在糗事百科爬虫中我们的爬虫是继承scrapy.Spider类的,这也是基本的scrapy框架爬虫,在这个爬虫中我们自己在解析完整个页面后再获取下一页的url,然后重新发送了一个请求,而使用CrawlsSpider类可以帮助我们对url提出条件,只要满足这个条件,都进行爬取,CrawlSpider类继承自Spider,它比之前的Spid
Wesley13 Wesley13
3年前
(原创)Scrapy爬取美女图片续集
      上一篇咱们讲解了Scrapy的工作机制和如何使用Scrapy爬取美女图片,而今天接着讲解Scrapy爬取美女图片,不过采取了不同的方式和代码实现,对Scrapy的功能进行更深入的运用。!(https://oscimg.oschina.net/oscnet/495475f784c4eb6eadac4fb32e103808c26.jpg)
Wesley13 Wesley13
3年前
00:Java简单了解
浅谈Java之概述Java是SUN(StanfordUniversityNetwork),斯坦福大学网络公司)1995年推出的一门高级编程语言。Java是一种面向Internet的编程语言。随着Java技术在web方面的不断成熟,已经成为Web应用程序的首选开发语言。Java是简单易学,完全面向对象,安全可靠,与平台无关的编程语言。
Stella981 Stella981
3年前
Scrapy_redis
简介scrapy\_redis是一个基于Redis的Scrapy组件,用于scrapy项目的分布式部署和开发你可以启动多个spider对象,互相之间共享有一个redis的request队列,最适合多个域名的广泛内容的爬取特点分布式爬取分布式数据处理爬取到的item数据被推送到redis中,这意味着你可以启动尽可能多的item处理程序