Scraping Reviews of Popular Movies on Douban

This is something I threw together for practice.

There are certainly plenty of bugs, and most of the exception handling is missing.

config.py

START_URL = 'https://movie.douban.com/j/search_subjects?'
TAG = '热门'
LIMIT = 20
SORT = 'recommend'
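
For reference, urlencode turns these settings into the query string that spider.py appends to START_URL, with TAG percent-encoded as UTF-8. A quick check, assuming page_start=0 for the first page:

from urllib.parse import urlencode
from config import *

params = {'type': 'movie', 'tag': TAG, 'sort': SORT, 'page_limit': LIMIT, 'page_start': 0}
print(START_URL + urlencode(params))
# https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=0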

spider.py

import json
import os
from urllib.parse import urlencode

import requests
from pyquery import PyQuery as pq

from config import *


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36'
}

def generate_index_url(tag, start, limit):
    data = {
        'type': 'movie',
        'tag': tag,
        'sort': SORT,
        'page_limit': limit,
        'page_start': start,
    }
    return START_URL + urlencode(data)

def generate_comment_url(movie_url, start, limit):
    data = {
        'start': start,
        'limit': limit,
        'sort': 'new_score',
        'status': 'P',
        'comment_only': 1
    }
    return movie_url + 'comments?' + urlencode(data)


def get_data(url):
    try:
        # switch to a proxy when the IP gets banned
        response = requests.get(url, headers=headers)
        # response = requests.get(url, headers=headers, proxies=get_proxy())
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException as e:  # the builtin ConnectionError would not catch requests' errors
        print(e)
        return None

def get_proxy():
    return {
        'http': 'socks5://127.0.0.1:1080',
        'https': 'socks5://127.0.0.1:1080',
    }

# extract each movie's URL and metadata from the index JSON
def parse_index(json_data):
    data = json.loads(json_data)
    if data and 'subjects' in data:
        movies = data.get('subjects')
        for movie in movies:
            yield {
                "title": movie.get('title'),
                "rate": movie.get('rate'),
                "cover": movie.get('cover'),
                "url": movie.get('url'),
                "id": movie.get('id'),
                "is_new": movie.get('is_new'),
                "playable": movie.get('playable')
            }

def parse_comment(html):
    doc = pq(html)
    items = doc('#comments .comment-item').items()
    for item in items:
        yield {
            'avatar': item.find('.avatar a img').attr('src'),
            'comment': item.find('.comment p span').text(),
            'votes': item.find('.comment h3 .comment-vote .votes').text(),
            'rating': item.find('.comment h3 .comment-info .rating').attr('title'),
            'comment_time': item.find('.comment h3 .comment-info span.comment-time').attr('title')
        }

# append one comment record to a per-movie text file
def save_comment(title, content):
    file_path = '{0}/movie_comment/'.format(os.getcwd())
    mkdir(file_path)
    file_name = file_path + title + '.txt'
    with open(file_name, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

# create the directory if it does not exist
def mkdir(path):
    if not os.path.exists(path):
        os.makedirs(path)  # makedirs also creates missing intermediate directories


def main(start):
    try:
        index_url = generate_index_url(TAG, start, LIMIT)
        for movie in parse_index(get_data(index_url)):
            movie_url = movie['url']
            title = movie['title']
            print(movie)
            # fetch the first 10 pages of comments for each movie
            for comment_start in [i * 20 for i in range(0, 10)]:
                html = get_data(generate_comment_url(movie_url, comment_start, LIMIT))
                if html is None:  # skip pages that failed to download
                    continue
                for comment in parse_comment(html):
                    save_comment(title, comment)
    except TypeError as e:
        print(e)

if __name__ == '__main__':
    for start in [i * 20 for i in range(0, 10)]:
        main(start)
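
Since most of the exception handling is admittedly missing, here is a minimal sketch (not part of the original script) of a hardened get_data with a timeout and simple retries; the name get_data_with_retry, the retry count, and the delay are my own assumptions:

import time
import requests

def get_data_with_retry(url, retries=3, delay=2):
    # hypothetical replacement for get_data: adds a timeout and simple retries,
    # reusing the headers dict defined in spider.py
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response.text
        except requests.RequestException as e:
            print('attempt {0} failed: {1}'.format(attempt + 1, e))
        time.sleep(delay)  # back off before the next attempt
    return None

Note also that the socks5:// proxies returned by get_proxy only work if the requests[socks] extra (PySocks) is installed.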