Notes on my process of learning Python web scraping.
I'm learning scraping by following videos on Bilibili; being able to pull down a few novels to read in my spare time wouldn't be bad either.
Let's start simple and take it step by step.
Following along with the video, I built a simple crawler that fetches street-style photos from Toutiao (今日头条).
I. Getting the URL
First of all, we obviously need to get hold of the address we want to request!
By inspecting the page's network requests, we find that the request address is https://www.toutiao.com/search_content/
The request parameters are:

```
offset: 0
format: json
keyword: 街拍
autoload: true
count: 20
cur_tab: 3
from: gallery
```
Of these, only the `offset` parameter changes dynamically.
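As a quick sanity check, here is how the parameters are assembled into the final request URL with `urlencode()` (the same call the code in part III relies on):

```python
from urllib.parse import urlencode

params = {'offset': 0, 'format': 'json', 'keyword': '街拍',
          'autoload': 'true', 'count': 20, 'cur_tab': 3, 'from': 'gallery'}
print('https://www.toutiao.com/search_content/?' + urlencode(params))
```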
II. Analyzing the Returned Data
Because this page loads its content through dynamic requests (Ajax), analyzing the HTML source directly gets us nowhere.
By inspecting the requests fired while the page loads, we obtained the request address above; sending a request to it gives us a response in JSON format.
Here is the returned data (abridged):

```json
{
"count": 20,
"action_label": "click_search",
"return_count": 20,
"has_more": 1,
"page_id": "/search/",
"request_id": "20180805175803172018076068601D75",
"cur_tab": 3,
"tokens": [
"街拍"
],
"tab": {
"tab_list": [
{
"tab_name": "综合",
"tab_id": 1,
"tab_code": "news"
},
{
"tab_name": "视频",
"tab_id": 2,
"tab_code": "video"
},
{
"tab_name": "图集",
"tab_id": 3,
"tab_code": "gallery"
},
{
"tab_name": "用户",
"tab_id": 4,
"tab_code": "pgc"
},
{
"tab_name": "问答",
"tab_id": 5,
"tab_code": "wenda"
}
],
"cur_tab": 3
},
"offset": 20,
"action_label_web": "click_search",
"show_tabs": 1,
"data": [
{
"open_url": "sslocal://detail?aggr_type=0&article_type=0&gd_ext_json=%7B%22city%22%3A%22%22%2C%22log_pb%22%3A%7B%22impr_id%22%3A%2220180805175803172018076068601D75%22%7D%2C%22query%22%3A%22%E8%A1%97%E6%8B%8D%22%2C%22search_result_id%22%3A6586051680016531975%2C%22source%22%3A%22%E5%A4%A9%E5%A4%A9%E7%BE%8E%E5%A5%B3%E7%85%A7%22%7D&gd_label=click_search&groupid=6586051680016531975&item_id=6586051680016531975&search_id=20180805175803172018076068601D75",
"media_name": "天天美女照",
"show_play_effective_count": 0,
"media_url": "http://toutiao.com/m1601400804447236/",
"item_source_url": "/group/6586051680016531975/",
"labels": [ ],
"image_list": [
{
"url": "//p3.pstatp.com/list/pgc-image/15334345790164a39d4cda3"
},
{
"url": "//p1.pstatp.com/list/pgc-image/1533434579202360e49e179"
},
{
"url": "//p3.pstatp.com/list/pgc-image/1533434579169b28fe5e60e"
},
{
"url": "//p3.pstatp.com/list/pgc-image/1533434579185ea991c570c"
}
],
"media_avatar_url": "//p3.pstatp.com/medium/6eed0004f5928763adad",
"datetime": "2018-08-05 10:04:11",
"more_mode": false,
"create_time": "1533434651",
"has_gallery": true,
"id": "6586051680016531975",
"user_id": 6686213364,
"title": "街拍,简单时尚的穿搭,让夏天美不胜收",
"has_video": false,
"share_url": "http://toutiao.com/group/6586051680016531975/",
"source": "天天美女照",
"gallery_pic_count": 5,
"article_url": "http://toutiao.com/group/6586051680016531975/",
"comments_count": 0,
"large_mode": true,
"abstract": "",
"large_image_url": "http://p3.pstatp.com/large/pgc-image/15334345790164a39d4cda3",
"display_time": "1533434651",
"publish_time": "1533434651",
"middle_mode": false,
"gallary_image_count": 5,
"media_creator_id": 6686213364,
"tag_id": 6586051680016532000,
"source_url": "/group/6586051680016531975/",
"tag": "news_fashion",
"item_id": "6586051680016531975",
"user_auth_info": { },
"seo_url": "/group/6586051680016531975/",
"keyword": "街拍",
"middle_image_url": "http://p3.pstatp.com/list/300x196/pgc-image/15334345790164a39d4cda3",
"behot_time": "1533434651",
"comment_count": 0,
"image_url": "//p9.pstatp.com/list/pgc-image/15334345790164a39d4cda3",
"has_image": true,
"highlight": {
"source": [ ],
"abstract": [ ],
"title": [
[
0,
2
]
]
},
"group_id": "6586051680016531975",
"image_count": 5
},
{
"open_url": "sslocal://detail?aggr_type=0&article_type=0&gd_ext_json=%7B%22city%22%3A%22%22%2C%22log_pb%22%3A%7B%22impr_id%22%3A%2220180805175803172018076068601D75%22%7D%2C%22query%22%3A%22%E8%A1%97%E6%8B%8D%22%2C%22search_result_id%22%3A6586115603117900295%2C%22source%22%3A%22%E6%98%9F%E6%96%87%E5%A8%B1%E5%9C%88%22%7D&gd_label=click_search&groupid=6586115603117900295&item_id=6586115603117900295&search_id=20180805175803172018076068601D75",
"media_name": "星文娱圈",
"show_play_effective_count": 0,
"media_url": "http://toutiao.com/m1559497870734338/",
"item_source_url": "/group/6586115603117900295/",
"labels": [ ],
"image_list": [
{
"url": "//p3.pstatp.com/list/pgc-image/1533449531156c9dab9a319"
},
{
"url": "//p3.pstatp.com/list/pgc-image/153344953242934a72b84b8"
},
{
"url": "//p3.pstatp.com/list/pgc-image/15334495329977486219465"
},
{
"url": "//p3.pstatp.com/list/pgc-image/153344953341216845e8bcc"
}
],
"media_avatar_url": "//p1.pstatp.com/medium/47070000b68d3d5aa66d",
"datetime": "2018-08-05 14:12:15",
"more_mode": false,
"create_time": "1533449535",
"has_gallery": true,
"id": "6586115603117900295",
"user_id": 55831953939,
"title": "街拍路人,简约百搭不挑人的穿搭参考,让你轻松穿出女神范",
"has_video": false,
"share_url": "http://toutiao.com/group/6586115603117900295/",
"source": "星文娱圈",
"gallery_pic_count": 7,
"article_url": "http://toutiao.com/group/6586115603117900295/",
"comments_count": 1,
"large_mode": true,
"abstract": "",
"large_image_url": "http://p3.pstatp.com/large/pgc-image/1533449531156c9dab9a319",
"display_time": "1533449535",
"publish_time": "1533449535",
"middle_mode": false,
"gallary_image_count": 7,
"media_creator_id": 55831953939,
"tag_id": 6586115603117901000,
"source_url": "/group/6586115603117900295/",
"tag": "news_fashion",
"item_id": "6586115603117900295",
"user_auth_info": { },
"seo_url": "/group/6586115603117900295/",
"keyword": "街拍",
"middle_image_url": "http://p9.pstatp.com/list/300x196/pgc-image/1533449531156c9dab9a319",
"behot_time": "1533449535",
"comment_count": 1,
"image_url": "//p1.pstatp.com/list/pgc-image/1533449531156c9dab9a319",
"has_image": true,
"highlight": {
"source": [ ],
"abstract": [ ],
"title": [
[
0,
2
]
]
},
"group_id": "6586115603117900295",
"image_count": 7
}
],
"message": "success",
"action_label_pgc": "click_search"
}
```
Now let's analyze this data. What we want is the value of the `article_url` field of each entry inside the `data` array. Since the response is JSON, we can load it with `json.loads()` and pick out each `article_url`, as the quick demo below shows.
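For instance, assuming `resp_text` holds the JSON response shown above:

```python
import json

data = json.loads(resp_text)
for item in data['data']:
    print(item['article_url'])
```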
Once we have an `article_url`, we request that URL and get back the article page's HTML.
Here is the data-bearing part of the returned HTML:

```json
{
"count": 7,
"sub_images": [
{
"url": "http://p1.pstatp.com/origin/pgc-image/1533449531156c9dab9a319",
"width": 507,
"url_list": [
{
"url": "http://p1.pstatp.com/origin/pgc-image/1533449531156c9dab9a319"
},
{
"url": "http://pb3.pstatp.com/origin/pgc-image/1533449531156c9dab9a319"
},
{
"url": "http://pb9.pstatp.com/origin/pgc-image/1533449531156c9dab9a319"
}
],
"uri": "origin/pgc-image/1533449531156c9dab9a319",
"height": 800
},
{
"url": "http://p9.pstatp.com/origin/pgc-image/153344953242934a72b84b8",
"width": 640,
"url_list": [
{
"url": "http://p9.pstatp.com/origin/pgc-image/153344953242934a72b84b8"
},
{
"url": "http://pb1.pstatp.com/origin/pgc-image/153344953242934a72b84b8"
},
{
"url": "http://pb3.pstatp.com/origin/pgc-image/153344953242934a72b84b8"
}
],
"uri": "origin/pgc-image/153344953242934a72b84b8",
"height": 1024
},
{
"url": "http://p9.pstatp.com/origin/pgc-image/15334495329977486219465",
"width": 640,
"url_list": [
{
"url": "http://p9.pstatp.com/origin/pgc-image/15334495329977486219465"
},
{
"url": "http://pb1.pstatp.com/origin/pgc-image/15334495329977486219465"
},
{
"url": "http://pb3.pstatp.com/origin/pgc-image/15334495329977486219465"
}
],
"uri": "origin/pgc-image/15334495329977486219465",
"height": 1024
},
{
"url": "http://p1.pstatp.com/origin/pgc-image/153344953341216845e8bcc",
"width": 511,
"url_list": [
{
"url": "http://p1.pstatp.com/origin/pgc-image/153344953341216845e8bcc"
},
{
"url": "http://pb3.pstatp.com/origin/pgc-image/153344953341216845e8bcc"
},
{
"url": "http://pb9.pstatp.com/origin/pgc-image/153344953341216845e8bcc"
}
],
"uri": "origin/pgc-image/153344953341216845e8bcc",
"height": 800
},
{
"url": "http://p1.pstatp.com/origin/pgc-image/15334495337171dfbe2e31d",
"width": 640,
"url_list": [
{
"url": "http://p1.pstatp.com/origin/pgc-image/15334495337171dfbe2e31d"
},
{
"url": "http://pb3.pstatp.com/origin/pgc-image/15334495337171dfbe2e31d"
},
{
"url": "http://pb9.pstatp.com/origin/pgc-image/15334495337171dfbe2e31d"
}
],
"uri": "origin/pgc-image/15334495337171dfbe2e31d",
"height": 1024
},
{
"url": "http://p3.pstatp.com/origin/pgc-image/15334495342594477e81223",
"width": 690,
"url_list": [
{
"url": "http://p3.pstatp.com/origin/pgc-image/15334495342594477e81223"
},
{
"url": "http://pb9.pstatp.com/origin/pgc-image/15334495342594477e81223"
},
{
"url": "http://pb1.pstatp.com/origin/pgc-image/15334495342594477e81223"
}
],
"uri": "origin/pgc-image/15334495342594477e81223",
"height": 1065
},
{
"url": "http://p3.pstatp.com/origin/pgc-image/15334495349295abe4d034f",
"width": 690,
"url_list": [
{
"url": "http://p3.pstatp.com/origin/pgc-image/15334495349295abe4d034f"
},
{
"url": "http://pb9.pstatp.com/origin/pgc-image/15334495349295abe4d034f"
},
{
"url": "http://pb1.pstatp.com/origin/pgc-image/15334495349295abe4d034f"
}
],
"uri": "origin/pgc-image/15334495349295abe4d034f",
"height": 1227
}
],
"max_img_width": 690,
"labels": [
"u65f6u88c5u642du914d"
],
"sub_abstracts": [
"u8857u62cdu8defu4ebauff0cu7b80u7ea6u767eu642du4e0du6311u4ebau7684u7a7fu642du53c2u8003uff0cu8ba9u4f60u8f7bu677eu7a7fu51fau5973u795eu8303",
" ",
"u8857u62cdu8defu4ebauff0cu7b80u7ea6u767eu642du4e0du6311u4ebau7684u7a7fu642du53c2u8003uff0cu8ba9u4f60u8f7bu677eu7a7fu51fau5973u795eu8303",
" ",
" ",
" ",
"u8857u62cdu8defu4ebauff0cu7b80u7ea6u767eu642du4e0du6311u4ebau7684u7a7fu642du53c2u8003uff0cu8ba9u4f60u8f7bu677eu7a7fu51fau5973u795eu8303"
],
"sub_titles": [
"u8857u62cdu8defu4ebauff0cu7b80u7ea6u767eu642du4e0du6311u4ebau7684u7a7fu642du53c2u8003uff0cu8ba9u4f60u8f7bu677eu7a7fu51fau5973u795eu8303",
"u8857u62cdu8defu4ebauff0cu7b80u7ea6u767eu642du4e0du6311u4ebau7684u7a7fu642du53c2u8003uff0cu8ba9u4f60u8f7bu677eu7a7fu51fau5973u795eu8303",
"u8857u62cdu8defu4ebauff0cu7b80u7ea6u767eu642du4e0du6311u4ebau7684u7a7fu642du53c2u8003uff0cu8ba9u4f60u8f7bu677eu7a7fu51fau5973u795eu8303",
"u8857u62cdu8defu4ebauff0cu7b80u7ea6u767eu642du4e0du6311u4ebau7684u7a7fu642du53c2u8003uff0cu8ba9u4f60u8f7bu677eu7a7fu51fau5973u795eu8303",
"u8857u62cdu8defu4ebauff0cu7b80u7ea6u767eu642du4e0du6311u4ebau7684u7a7fu642du53c2u8003uff0cu8ba9u4f60u8f7bu677eu7a7fu51fau5973u795eu8303",
"u8857u62cdu8defu4ebauff0cu7b80u7ea6u767eu642du4e0du6311u4ebau7684u7a7fu642du53c2u8003uff0cu8ba9u4f60u8f7bu677eu7a7fu51fau5973u795eu8303",
"u8857u62cdu8defu4ebauff0cu7b80u7ea6u767eu642du4e0du6311u4ebau7684u7a7fu642du53c2u8003uff0cu8ba9u4f60u8f7bu677eu7a7fu51fau5973u795eu8303"
]
}
```
Analyzing this shows that all of the article's image addresses are written inside the JavaScript code; what we need is the `url` field of each entry in the `sub_images` array.
We can use a regular expression to extract the JSON wrapped in `gallery: JSON.parse()` from the returned HTML. One remaining problem is that the data is full of escape characters. We can remove them with `str.replace('\\', '')`; once the escape characters are gone, `json.loads()` can turn the JSON into an easily handled Python object.
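Here is a toy demonstration of that de-escaping trick (the string below is a made-up miniature of the real payload):

```python
import json

raw = '{\\"sub_images\\": [{\\"url\\": \\"http://p1.pstatp.com/origin/x\\"}]}'
clean = raw.replace('\\', '')        # strip the escape characters
data = json.loads(clean)             # now it parses without complaint
print(data['sub_images'][0]['url'])  # http://p1.pstatp.com/origin/x
```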
After that, extracting the image URLs we need is straightforward. And once we have the image URLs, downloading the images is easy; you can save them to files or store them in a database.
III. Writing the Code
1. Assemble the URL and fetch the gallery list
`offset` is the pagination offset; change it whenever you want to fetch more results. The `urlencode()` function adds the parameters to the URL.
The code is as follows:

```python
import requests
from requests.exceptions import RequestException
from urllib.parse import urlencode


def get_page_index(offset, keyword):
    # Build the query string and request the search endpoint
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3,
        'from': 'gallery'
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('Error requesting the index page')
        return None
```
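A quick way to try it out:

```python
html = get_page_index(0, '街拍')
print(html[:200] if html else 'request failed')
```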
2. Parse the response to get the gallery addresses
From the analysis above, we know we want the `article_url` values from the returned JSON. After loading the data with `json.loads()`, we first check whether the `data` key is present; if it is, we loop over the `data` entries and read out each one's `article_url`.
Here is the source:

```python
import json


# Extract the `article_url` values from the returned JSON
def parse_page_index(jsondata):
    data = json.loads(jsondata)
    if data and 'data' in data.keys():
        for item in data.get('data'):
            yield item.get('article_url')
```
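Chained together with step 1:

```python
html = get_page_index(0, '街拍')
if html:
    for article_url in parse_page_index(html):
        print(article_url)
```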
3. Request each gallery address and fetch the returned data
With the gallery addresses from the previous step in hand, this step requests each address and collects the returned data.
The source is as follows:
```python
# Fetch the gallery page's data
```
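A minimal version of this step, assuming it follows the same plain `requests.get` pattern as `get_page_index` (the function name `get_page_detail` and the error message are assumptions):

```python
import requests
from requests.exceptions import RequestException


def get_page_detail(url):
    # Request the gallery page and return its HTML, or None on failure
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('Error requesting the detail page')
        return None
```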
4. Parse the gallery page data and extract the image URLs we want
We parse the returned data with `BeautifulSoup(html, 'lxml')` to get the gallery's title, then use a regular expression to extract the JSON embedded in the JavaScript code. Because the extracted JSON contains a large number of escape characters, `json.loads()` cannot load it as-is; after removing them with the string method `replace('\\', '')`, loading it with `json.loads()` works without a problem. We first check whether the JSON contains a `sub_images` key; if it does, we extract the image URLs from the `sub_images` data and return them.
```python
# Parse the gallery page and extract the image URLs
```
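A sketch of this parsing step assembled from the pieces described above (the exact regular expression and the shape of the returned dict are assumptions):

```python
import json
import re

from bs4 import BeautifulSoup


def parse_page_detail(html):
    # The gallery title lives in the page's <title> tag
    soup = BeautifulSoup(html, 'lxml')
    title = soup.title.get_text() if soup.title else ''
    # Pull out the JSON wrapped in `gallery: JSON.parse("...")`
    result = re.search(r'gallery: JSON\.parse\("(.*?)"\)', html, re.S)
    if result:
        # Remove the escape characters, then load the JSON
        data = json.loads(result.group(1).replace('\\', ''))
        if data and 'sub_images' in data.keys():
            images = [item.get('url') for item in data.get('sub_images')]
            return {'title': title, 'images': images}
    return None
```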
5. Download the images to disk via their URLs
(1) Request the URL and get the response data
This function returns `response.content`, which is binary data; `response.text` would be text. Since we are downloading images here, we should return the binary data.
The code is as follows:
```python
# Download the image from its URL and save it
```
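A sketch of the download step: it requests the image, takes the binary `response.content`, and hands it to the `save_image()` helper defined in the next step (the function names are assumptions):

```python
import requests
from requests.exceptions import RequestException


def download_image(url):
    # response.content is the binary payload; response.text would be the text form
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)
    except RequestException:
        print('Error requesting the image')
```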
(2) Save the downloaded image locally
```python
# Save the image
```
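A minimal `save_image()`; naming the file after the MD5 hash of its content is an assumption on my part, but it conveniently avoids saving duplicates:

```python
import os
from hashlib import md5


def save_image(content):
    # The MD5 of the bytes gives a stable, duplicate-free file name
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)
```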
At this point the crawler's functionality is complete.
IV. Final Words
The full source:
```python
import json
```
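The complete script is essentially the functions above plus an entry point. A sketch of how they can be wired together (the offset range of three pages is arbitrary):

```python
def main(offset):
    html = get_page_index(offset, '街拍')
    if html:
        for article_url in parse_page_index(html):
            if not article_url:
                continue
            detail_html = get_page_detail(article_url)
            if detail_html:
                result = parse_page_detail(detail_html)
                if result:
                    for image_url in result.get('images'):
                        download_image(image_url)


if __name__ == '__main__':
    for offset in range(0, 60, 20):
        main(offset)
```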
Last of all: there are of course many, many things that could be improved, but my skills aren't quite there yet...