```python
# Extract each movie's URL and metadata from the JSON index response
def parse_index(json_data):
    data = json.loads(json_data)
    if data and 'subjects' in data.keys():
        movies = data.get('subjects')
        for movie in movies:
            yield {
                "title": movie.get('title'),
                "rate": movie.get('rate'),
                "cover": movie.get('cover'),
                "url": movie.get('url'),
                "id": movie.get('id'),
                "is_new": movie.get('is_new'),
                "playable": movie.get('playable')
            }
```
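To make the parsing logic concrete, here is a tiny hand-written payload (the field values are placeholders, not real Douban data) showing the `subjects` structure that `parse_index` expects:

```python
# Hand-made sample payload; values are placeholders, not a real Douban response.
sample = '''{"subjects": [
    {"title": "Movie A", "rate": "8.5",
     "cover": "https://example.com/a.jpg",
     "url": "https://movie.douban.com/subject/1/",
     "id": "1", "is_new": false, "playable": true}
]}'''

for movie in parse_index(sample):
    print(movie["title"], movie["rate"], movie["url"])
```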
```python
# Parse one page of comments: avatar, text, vote count, rating and time for each item
def parse_comment(html):
    doc = pq(html)
    items = doc('#comments .comment-item').items()
    for item in items:
        yield {
            'avatar': item.find('.avatar a img').attr('src'),
            'comment': item.find('.comment p span').text(),
            'votes': item.find('.comment h3 .comment-vote .votes').text(),
            'rating': item.find('.comment h3 .comment-info .rating').attr('title'),
            'comment_time': item.find('.comment h3 .comment-info span.comment-time').attr('title')
        }
```
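The selectors above assume a specific comment markup. The fragment below is hand-written to match those selectors (real Douban pages contain far more markup) and shows how `parse_comment` can be exercised on its own:

```python
from pyquery import PyQuery as pq  # jQuery-style selectors on top of lxml

# Hand-written HTML matching the selectors used above; not actual Douban markup.
sample_html = '''
<div id="comments">
  <div class="comment-item">
    <div class="avatar"><a href="#"><img src="https://example.com/u.png"></a></div>
    <div class="comment">
      <h3>
        <span class="comment-vote"><span class="votes">12</span></span>
        <span class="comment-info">
          <span class="rating" title="力荐"></span>
          <span class="comment-time" title="2019-01-01 10:00:00"></span>
        </span>
      </h3>
      <p><span>A short sample comment.</span></p>
    </div>
  </div>
</div>
'''

for comment in parse_comment(sample_html):
    print(comment)
```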
```python
# Append each comment to a text file named after the movie
def save_comment(title, content):
    file_path = '{0}/movie_comment/'.format(os.getcwd())
    mkdir(file_path)
    file_name = file_path + title + '.txt'
    with open(file_name, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
```
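`save_comment` relies on a `mkdir` helper (defined earlier in the post). If you are reading this section on its own, a minimal equivalent built on `os.makedirs` is:

```python
import os

def mkdir(path):
    # Create the directory (and any parents) if it does not already exist
    os.makedirs(path, exist_ok=True)
```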
```python
def main(start):
    try:
        url = generate_index_url(TAG, start, LIMIT)
        for item in parse_index(get_data(url)):
            url = item['url']
            title = item['title']
            print(item)
            # Fetch the first 10 pages of comments (20 per page) for this movie
            for comment_start in [i * 20 for i in range(10)]:
                html = get_data(generate_comment_url(url, comment_start, LIMIT))
                for comment in parse_comment(html):
                    save_comment(title, comment)
    except TypeError as e:
        print(e)
```
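`main` also depends on `get_data`, `generate_index_url`, `generate_comment_url` and the `TAG` / `LIMIT` constants from earlier sections. The sketch below shows roughly what they are assumed to do; the constant values and URL templates follow Douban's commonly used endpoints and may differ from the original helpers:

```python
import requests

TAG = '热门'   # assumed tag value; the original constant may differ
LIMIT = 20    # assumed page size; the original constant may differ

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # Douban rejects requests without a browser UA

def get_data(url):
    # Fetch a URL and return the response body as text, or None on failure
    resp = requests.get(url, headers=HEADERS)
    return resp.text if resp.status_code == 200 else None

def generate_index_url(tag, start, limit):
    # JSON endpoint behind the movie tag page
    return ('https://movie.douban.com/j/search_subjects'
            '?type=movie&tag={}&page_limit={}&page_start={}').format(tag, limit, start)

def generate_comment_url(url, start, limit):
    # Comment pages hang off each movie's detail URL
    return '{}comments?start={}&limit={}&sort=new_score&status=P'.format(url, start, limit)
```

Note that if `get_data` returns `None`, the resulting `TypeError` is swallowed by the `try/except` in `main`, which matches how the original code handles failed requests.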
```python
if __name__ == '__main__':
    # Page through the index: 10 pages, 20 movies each
    for start in [i * 20 for i in range(10)]:
        main(start)
```