按照这个方法,将其它想抓取的数据都取出来

ITFunSpider/spiders/course.py

import scrapy
from ITFunSpider.items import ItfunspiderItem


class CourseSpider(scrapy.Spider):
    """Spider that scrapes the itfun.tv front-end course listing with XPath.

    One ``ItfunspiderItem`` is yielded for every course link found under the
    ``courseList`` container of the start page.
    """

    name = 'course'
    allowed_domains = ['itfun.tv']
    start_urls = ['https://itfun.tv/course_categories/front_end']

    def parse(self, response):
        """Extract the fields of every course on the listing page.

        Args:
            response: The downloaded listing page.

        Yields:
            ItfunspiderItem: Populated with ``title``, ``date``, ``tags``,
            ``url``, ``image`` and ``likes``. A field whose node is missing
            is ``None`` (``tags`` is an empty string in that case).
        """
        for course in response.xpath('//div[@id="courseList"]/div/a'):
            item = ItfunspiderItem()

            # Node holding the course's textual information.
            info = course.xpath('./div[@class="info"]')

            # Title.
            item['title'] = info.xpath('./h3/text()').extract_first()

            # Date. The scraped text ends with the two characters "更新",
            # which are sliced off. extract_first() returns None when nothing
            # matches, so guard before slicing: one malformed course must not
            # abort parsing of the whole page.
            raw_date = info.xpath('./p/span/text()').extract_first()
            item['date'] = raw_date[:-2] if raw_date else raw_date

            # Tags. A course can carry several tags, so the selector yields a
            # list; join it into a single space-separated string.
            tags = info.xpath('./div[@class="detail"]/p[@class="tags"]/em/text()').extract()
            item['tags'] = " ".join(tags)

            # Link to the course detail page.
            item['url'] = course.xpath('./@href').extract_first()

            # Image. The site lazy-loads images, so the real address lives in
            # data-src rather than src.
            item['image'] = course.xpath('./div[@class="images"]/img/@data-src').extract_first()

            # Number of likes.
            item['likes'] = course.xpath(
                './div[@class="reservation mark"]/div/div/button/div/span/text()').extract_first()

            yield item

ITFunSpider/pipelines.py

class ItfunspiderPipeline:
    """Item pipeline that prints every scraped course to stdout."""

    def process_item(self, item, spider):
        """Print the fields of ``item`` and pass it on unchanged.

        Args:
            item: The scraped course; must provide the keys ``title``,
                ``url``, ``date``, ``image``, ``likes`` and ``tags``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object.

        Raises:
            KeyError: If one of the printed keys is missing from ``item``.
        """
        print("标题:", item['title'])
        print("详情连接:", item['url'])
        print("日期:", item['date'])
        print("图片:", item['image'])
        print("点赞数:", item['likes'])
        print("标签:", item['tags'])

        # Scrapy hands the return value to the next pipeline stage and to the
        # feed exporters; returning nothing would pass None downstream.
        return item

再次运行

$ scrapy crawl course

使用 CSS 选择器

如果不喜欢 XPath,也可以使用 CSS 选择器的方式匹配数据

class CourseSpider(scrapy.Spider):
    """Spider that scrapes the itfun.tv front-end course listing with CSS selectors.

    One ``ItfunspiderItem`` is yielded for every course link found inside the
    ``div.listItem`` elements of the ``courseList`` container.
    """

    name = 'course'
    allowed_domains = ['itfun.tv']
    start_urls = ['https://itfun.tv/course_categories/front_end']

    def parse(self, response):
        """Extract the fields of every course on the listing page.

        Args:
            response: The downloaded listing page.

        Yields:
            ItfunspiderItem: Populated with ``title``, ``date``, ``tags``,
            ``url``, ``image`` and ``likes``. A field whose node is missing
            is ``None`` (``tags`` is an empty string in that case).
        """
        for course in response.css('div#courseList div.listItem a'):
            item = ItfunspiderItem()

            # Node holding the course's textual information.
            info = course.css('div.info')

            # Title.
            item['title'] = info.css('h3::text').extract_first()

            # Date. The scraped text ends with the two characters "更新",
            # which are sliced off. extract_first() returns None when nothing
            # matches, so guard before slicing to avoid a TypeError that
            # would abort parsing of the whole page.
            raw_date = info.css('p span:nth-child(1)::text').extract_first()
            item['date'] = raw_date[:-2] if raw_date else raw_date

            # Tags. A course can carry several tags, so the selector yields a
            # list; join it into a single space-separated string.
            tags = info.css('div.detail p.tags em::text').extract()
            item['tags'] = " ".join(tags)

            # Link to the course detail page.
            item['url'] = course.css('::attr(href)').extract_first()

            # Image. The site lazy-loads images, so the real address lives in
            # data-src rather than src.
            item['image'] = course.css('div.images img::attr(data-src)').extract_first()

            # Number of likes.
            item['likes'] = course.css('div.reservation.mark div div button div span::text').extract_first()

            yield item