导读:《我不是药神》是由文牧野执导,徐峥、王传君、周一围、谭卓、章宇、杨新鸣等主演的喜剧电影,于 2018 年 7 月 6 日在中国上映。
影片在上映前,就通过大规模点映积攒了相当高的人气和口碑。截至 7 月 9 日凌晨,各平台评分为:豆瓣 9.0 分,猫眼 9.7 分,淘票票 9.5 分,时光网 8.8 分。
为什么要提这几个网站呢?因为我们今天用到的 5000 多条短评数据就来自于此,用专业的数据更有说服力。
盛世危言,却让人能看到希望,这部影片极有可能成为 2018 年最具有爆炸性的话题。这也许就是未播先火,豆瓣 16 年后首部 9.0 高分电影的原因。
今天我们用 5000+ 条数据来分析一下,哪些地区,什么样的人,喜欢这部电影。
正版药「瑞士格列宁」非常昂贵,普通人家根本负担不起;印度有一款仿制药「印度格列宁」,价格只有正版药的 1/20,但它在中国属于禁药,走私被抓是需要负法律责任的。
截至 7 月 9 日凌晨,累计票房超过 13 亿,占当天票房近 84%。
首先是豆瓣。豆瓣自去年 10 月起已经全面禁止爬取数据,仅放出 500 条数据,并且会封 IP:白天一分钟可以访问 40 次,晚上一分钟可以访问 60 次,超过限制次数就会被封 IP。
"""Crawl Maoyan short comments for movie 1200486 and store them locally.

Every comment is appended as one text row to ``OUTPUT_FILE`` and the
commenter's avatar, when present, is downloaded into ``AVATAR_DIR``.
"""
import json
import random
import time
import urllib
from urllib import request

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win32; x32; rv:54.0) Gecko/20100101 Firefox/54.0',
          'Connection': 'keep-alive'}

# Raw "Cookie" header value copied from a browser session.
cookies = (
    'v=3; '
    'iuuid=1A6E888B4A4B29B16FBA1299108DBE9CDCB327A9713C232B36E4DB4FF222CF03; '
    'webp=true; '
    'ci=1%2C%E5%8C%97%E4%BA%AC; '
    '__guid=26581345.3954606544145667000.1530879049181.8303; '
    '_lxsdk_cuid=1646f808301c8-0a4e19f5421593-5d4e211f-100200-1646f808302c8; '
    '_lxsdk=1A6E888B4A4B29B16FBA1299108DBE9CDCB327A9713C232B36E4DB4FF222CF03; '
    'monitor_count=1; '
    '_lxsdk_s=16472ee89ec-de2-f91-ed0%7C%7C5; '
    '__mta=189118996.1530879050545.1530936763555.1530937843742.18'
)

COMMENTS_URL = 'http://m.maoyan.com/mmdb/comments/movie/1200486.json'
OUTPUT_FILE = 'maoyan_08.txt'
# Same directory as the original listing, written with valid escapes.
AVATAR_DIR = 'C:\\Users\\My\\Desktop\\yaoshen\\img\\'

# "host:port" of an HTTP proxy, or None for a direct connection.
# NOTE(review): the original listing formats an undefined name `proxy`
# into the proxies dict; where it came from is not visible - confirm.
proxy = None


def parse_cookies(raw):
    """Turn a ``name=value; name=value`` cookie header into a dict.

    Pieces without an ``=`` (for example the empty string) are ignored.
    The original loop split the whole header on every iteration instead of
    the current piece, so only the first cookie was ever stored.
    """
    jar = {}
    for pair in raw.split(';'):
        name, sep, value = pair.strip().partition('=')
        if sep:
            jar[name] = value
    return jar


cookie = parse_cookies(cookies)


def html_prase(url, **kwargs):
    """Fetch *url* with ``requests.get`` and return the raw response bytes.

    Extra keyword arguments are forwarded to ``requests.get``.
    """
    # Imported here so the pure helpers above can be imported and tested
    # without the third-party dependency installed.
    import requests
    return requests.get(url, **kwargs).content


def comment_fields(item):
    """Return ``(date, rate, comment, city)`` taken from one comment dict."""
    return (item['time'].split(' ')[0], item['score'],
            item['content'], item['cityName'])


def comment_line(item):
    """Build the text row stored for one comment.

    The comment text is written twice, exactly as in the original listing,
    which gives a five-column row with the city in the last column.
    """
    date, rate, comment, city = comment_fields(item)
    return date + ',' + str(rate) + ',' + comment + ',' + comment + ',' + city + '\n'


def save_comment(item):
    """Print one comment, append it to OUTPUT_FILE and download its avatar."""
    print(*comment_fields(item))
    with open(OUTPUT_FILE, 'a', encoding='utf-8') as f:
        f.write(comment_line(item))
    img = item.get('avatarurl')
    if img:
        target = AVATAR_DIR + img.split('/')[-1]
        # Both handles are closed even when the download fails half-way.
        with request.urlopen(img) as resp, open(target, 'wb') as out:
            out.write(resp.read())


def crawl_comments():
    """Fetch pages 1..99 of the comment feed, pausing between requests."""
    proxies = {"http": "http://{}".format(proxy)} if proxy else None
    for i in range(1, 100):
        print('正在打印第%s页' % i)
        try:
            # NOTE(review): the article text is cut off right after ".json",
            # so the query string is not visible. "_v_=yes&offset=<n>" is an
            # assumption - confirm against the live endpoint.
            html = html_prase(COMMENTS_URL,
                              params={'_v_': 'yes', 'offset': i},
                              headers=header, cookies=cookie, proxies=proxies)
            data = json.loads(html.decode('utf-8'))['cmts']
            for item in data:
                save_comment(item)
        except (OSError, ValueError, KeyError) as err:
            # Network errors are OSError subclasses, bad JSON is a ValueError
            # and a missing field is a KeyError. Report and move on instead
            # of silently swallowing everything.
            print('page %s failed: %r' % (i, err))
        # Random 5.05-10 s pause between pages.
        time.sleep(5 + random.randint(1, 100) / 20)


if __name__ == '__main__':
    crawl_comments()
这套方案(Anyproxy + JS + Python + Monkeyrunner)可以爬取 Web 静态网站、App 应用,以及由 JS 渲染数据的动态网站。
安装使用,请查阅官方 Github:
JS 代码:
var logMap = {} var fs = require('fs'); var iconv = require('iconv-lite'); var logger = fs.createWriteStream('./urlLog.log', { flags: 'a' // 'a' means appending (old data will be preserved) }) function logPageFile(url) { if (!logMap[url]) { logMap[url] = true; logger.write(url + '\r\n'); } } function postData(post_data, path, cb) { // // Build the post string from an object // var post_data = JSON.stringify({ // 'data': data // }); // An object of options to indicate where to post to var post_options = { host: '', port: '9999', path: '/' + path, method: 'POST', headers: { 'Content-Type': 'application/json', 'Content-Length': Buffer.byteLength(post_data) } }; var http = require('http'); // Set up the request var post_req = http.request(post_options, function (res) { res.setEncoding('utf8'); res.on('data', cb); }); logger.write('request post data 1\r\n') // post the data post_req.write(post_data); logger.write('request post data 2\r\n') post_req.end(); } module.exports = { summary: 'a rule to modify response', * beforeSendResponse(requestDetail, responseDetail) { if (/movie\/1200486/i.test(requestDetail.url)) { logger.write('matched: ' + requestDetail.url + '\r\n'); if (responseDetail.response.toString() !== "") { logger.write(responseDetail.response.body.toString()); var post_data = JSON.stringify({ 'url': requestDetail.url, 'body': responseDetail.response.body.toString() }); logger.write("post comment to server -- ext"); postData(post_data, 'douban_comment', function (chunk) { }); } } }, };
使用 AnyProxy 加载 JS 代码:anyproxy -i --rule wxrule.js
Service 代码部分:
#!/usr/bin/env python3
"""Collector service: receives data posted by the proxy rule and drives the
phone through adb so that new pages keep loading."""
import asyncio
import re
import textwrap
import threading
import time
import os
import pymysql
from mysqlmgr import MysqlMgr
from mongomgr import MongoManager
from subprocess import call
import requests
from lxml import etree
from lxml import html
from aiohttp.web import Application, Response, StreamResponse, run_app
import json

STATE_RUNNING = 1
STATE_IN_TRANSACTION = 2

running_state = 0
run_swipe = True
# time.clock() was removed in Python 3.8; a monotonic clock is what an
# "elapsed seconds since the last data" check needs.
last_history_time = time.monotonic()


def insert_to_database(biz, msglist):
    """Queue every message of *msglist* for storage (runs in a worker thread).

    Any exception is printed rather than raised.
    """
    # NOTE(review): `mongo_mgr` is not defined anywhere in the visible
    # listing - presumably a MongoManager instance; confirm.
    try:
        for msg in msglist:
            print(biz)
            print(msg['comm_msg_info']['id'])
            mongo_mgr.enqueue_data(msg['comm_msg_info']['id'], biz, msg)
    except Exception as e:
        print(e)


def save_data(biz, msglist_str):
    """Start a daemon thread that stores *msglist_str* in the background."""
    save_thread = threading.Thread(target=insert_to_database,
                                   args=(biz, msglist_str,), daemon=True)
    save_thread.start()


def swipe_for_next_page():
    """Swipe the phone screen every 5 seconds while ``run_swipe`` is true.

    When no data has arrived for more than 120 seconds the swipe is skipped,
    and ``reenter()`` is called if the state is ``STATE_RUNNING``.
    """
    while run_swipe:
        time.sleep(5)
        if time.monotonic() - last_history_time > 120:
            if running_state == STATE_RUNNING:
                reenter()
            continue
        # Swipe from (400, 1000) to (400, 200).
        call(["adb", "shell", "input", "swipe", "400", "1000", "400", "200"])


def reenter():
    """Leave the current page and open it again through adb."""
    global running_state
    running_state = STATE_IN_TRANSACTION
    # Simulate a side swipe to go back to the previous page.
    call(["adb", "shell", "input", "swipe", "0", "400", "400", "400"])
    time.sleep(2)
    # Tap "enter message history"; the position differs per phone, so X and
    # Y must be adjusted for each device.
    call(["adb", "shell", "input", "tap", "200", "1200"])
    time.sleep(2)


header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:54.0) Gecko/20100101 Firefox/54.0',
          'Connection': 'keep-alive'}


def html_prase(url):
    """Download *url* and return it parsed as an lxml HTML tree."""
    # The second positional argument of requests.get is `params`; the
    # headers have to be passed by keyword to be sent as headers.
    r = requests.get(url, headers=header).content
    return html.fromstring(r)


async def report_url(request):
    """Handle ``POST /url``: read the reported URL and its ``__biz`` value."""
    resp = StreamResponse()
    data = await request.json()
    url = data['url']
    # NOTE(review): the article text is cut off inside the regular
    # expression and the rest of this handler is missing. The pattern and
    # the empty text/plain reply below are a minimal reconstruction -
    # confirm against the original project.
    biz = re.findall(r'__biz=(.*?)&', url)
    if biz:
        print(biz[0])
    resp.content_type = 'text/plain'
    await resp.prepare(request)
    await resp.write_eof()
    return resp


async def intro(request):
    """Handle ``GET /``: return a short plain-text usage hint."""
    # NOTE(review): the head of this handler is missing from the article
    # text; the line breaks inside the message are reconstructed.
    txt = textwrap.dedent("""\
        Type {url}/hello/John  {url}/simple or {url}/change_body
        in browser url bar
    """).format(url='')
    binary = txt.encode('utf8')
    resp = StreamResponse()
    resp.content_length = len(binary)
    resp.content_type = 'text/plain'
    await resp.prepare(request)
    # StreamResponse.write is a coroutine; without `await` nothing is sent.
    await resp.write(binary)
    return resp


async def simple(request):
    """Return a fixed plain answer."""
    return Response(text="Simple answer")


async def change_body(request):
    """Return a response whose body is replaced after construction."""
    resp = Response()
    resp.body = b"Body changed"
    resp.content_type = 'text/plain'
    return resp


async def app_douban_comment(request):
    """Handle ``POST /douban_comment``: store the comments of one page.

    The request JSON carries the captured response text under ``body``.
    Its ``data.cts`` list is written to ``date_rate_comment_sg.txt`` as
    ``date,rate,comment`` rows, with the date fixed to 2018-07-06.
    """
    global last_history_time
    resp = StreamResponse()
    data = await request.json()
    msg_data = json.loads(data['body'])['data']['cts']
    with open('date_rate_comment_sg.txt', 'a', encoding='utf-8') as f:
        for item in msg_data:
            comment = item['ce'].strip().replace('\n', '')
            rate = item['cr']
            print(comment, rate)
            f.write('2018-07-06' + ',' + str(rate) + ',' + comment + '\n')
    # Remember when data last arrived so the swipe thread can detect a stall.
    last_history_time = time.monotonic()
    resp.content_type = 'text/plain'
    await resp.prepare(request)
    await resp.write_eof()
    return resp


async def init(loop):
    """Create the aiohttp application and register its three routes."""
    app = Application()
    app.router.add_get('/', intro)
    app.router.add_post('/url', report_url)
    app.router.add_post('/douban_comment', app_douban_comment)
    return app


def start_swipe_thread():
    """Run ``swipe_for_next_page`` in a daemon thread named 'swipe'."""
    try:
        # daemon thread, so the main thread can exit when it receives ctrl-c
        t = threading.Thread(target=swipe_for_next_page, name='swipe',
                             daemon=True)
        t.start()
    except Exception:
        print("Error: unable to start thread")


if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    app = loop.run_until_complete(init(loop))
    run_app(app, host='', port=9999)
这是示例代码,实际使用时需要进行微调。获取猫眼数据,最难的地方在于找到猫眼 App 的数据接口。
"""Two listings from the article: the Maoyan comment crawler and the
geographic chart of where the commenters come from."""
import json
import random
import time
import urllib
from collections import Counter
from urllib import request

# ---------------------------------------------------------------------------
# Listing 1: Maoyan comment crawler
# ---------------------------------------------------------------------------

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win32; x32; rv:54.0) Gecko/20100101 Firefox/54.0',
          'Connection': 'keep-alive'}

# Raw "Cookie" header value copied from a browser session.
cookies = (
    'v=3; '
    'iuuid=1A6E888B4A4B29B16FBA1299108DBE9CDCB327A9713C232B36E4DB4FF222CF03; '
    'webp=true; '
    'ci=1%2C%E5%8C%97%E4%BA%AC; '
    '__guid=26581345.3954606544145667000.1530879049181.8303; '
    '_lxsdk_cuid=1646f808301c8-0a4e19f5421593-5d4e211f-100200-1646f808302c8; '
    '_lxsdk=1A6E888B4A4B29B16FBA1299108DBE9CDCB327A9713C232B36E4DB4FF222CF03; '
    'monitor_count=1; '
    '_lxsdk_s=16472ee89ec-de2-f91-ed0%7C%7C5; '
    '__mta=189118996.1530879050545.1530936763555.1530937843742.18'
)

COMMENTS_URL = 'http://m.maoyan.com/mmdb/comments/movie/1200486.json'


def parse_cookies(raw):
    """Turn a ``name=value; name=value`` cookie header into a dict.

    Pieces without an ``=`` are ignored. The original loop split the whole
    header on every iteration instead of the current piece, so only the
    first cookie was ever stored.
    """
    jar = {}
    for pair in raw.split(';'):
        name, sep, value = pair.strip().partition('=')
        if sep:
            jar[name] = value
    return jar


cookie = parse_cookies(cookies)


def html_prase(url, **kwargs):
    """Fetch *url* with ``requests.get`` and return the raw response bytes.

    Extra keyword arguments are forwarded to ``requests.get``.
    """
    # Imported here so the pure helpers can be imported and tested without
    # the third-party dependency installed.
    import requests
    return requests.get(url, **kwargs).content


def crawl_comments():
    """Fetch pages 1..99 of the comment feed and print each page's comments.

    NOTE(review): in the article text this listing is cut off right after
    ".json" in the URL. The query string and everything after the request
    are therefore a reconstruction - confirm against the original.
    """
    for i in range(1, 100):
        print('正在打印第%s页' % i)
        try:
            html = html_prase(COMMENTS_URL,
                              params={'_v_': 'yes', 'offset': i},
                              headers=header, cookies=cookie)
            for item in json.loads(html.decode('utf-8'))['cmts']:
                print(item)
        except (OSError, ValueError, KeyError) as err:
            # Report the failure instead of silently skipping the page.
            print('page %s failed: %r' % (i, err))
        time.sleep(5 + random.randint(1, 100) / 20)


# ---------------------------------------------------------------------------
# Listing 2: geographic distribution of the commenters
# ---------------------------------------------------------------------------

def all_list(arr):
    """Return ``{value: number of occurrences}`` for the items of *arr*."""
    # One pass over the data instead of one list.count() call per value.
    return dict(Counter(arr))


def read_cities(file_name):
    """Return the fifth column of every five-column row in *file_name*.

    Rows with any other number of comma-separated fields are skipped.
    """
    found = []
    with open(file_name, mode='r', encoding='utf-8') as f:
        for row in f:
            fields = row.split(',')
            if len(fields) == 5:
                found.append(fields[4].replace('\n', ''))
    return found


def render_city_map():
    """Draw the commenters' cities from ``maoyan.txt`` on a pyecharts Geo map."""
    from pyecharts import Style
    from pyecharts import Geo

    city = read_cities('maoyan.txt')
    # Count once; the original recomputed the whole table for every city.
    data = list(all_list(city).items())

    style = Style(
        title_color="#fff",
        title_pos="center",
        width=1200,
        height=600,
        background_color='#404a59'
    )
    geo = Geo("《我不是药神》评论人群地理位置", "数据来源:知乎ID:布道", **style.init_style)
    attr, value = geo.cast(data)
    geo.add("", attr, value, visual_range=[0, 100],
            visual_text_color="#fff", is_legend_show=False,
            symbol_size=20, is_visualmap=True,
            tooltip_formatter='{b}',
            label_emphasis_textsize=15,
            label_emphasis_pos='right')
    geo.render()


if __name__ == '__main__':
    # Same order as the two listings appear in the article.
    crawl_comments()
    render_city_map()

# Next section of the article: "每天爬取数据量代码:"
from pyecharts import EffectScatter
from pyecharts import Style

# (x, y) pairs plotted one series each; going by the chart title these are
# the short-comment counts per crawl day.
DAILY_COUNTS = [
    (1, 270),
    (2, 606),
    (3, 542),
    (4, 550),
    (5, 656),
    (6, 850),
    (7, 993),
    (8, 903),
]

style = Style(
    title_color="#191970",
    title_pos="left",
    width=900,
    height=450,
    background_color='#F8F8FF'
)

es = EffectScatter("《我不是药神》短评数据情况", "数据来源:知乎ID:布道", **style.init_style)
for day, count in DAILY_COUNTS:
    # Points 5 and 6 were given the misspelled keyword "ssymbol_size" in
    # the original; drawing every point in one loop keeps them uniform.
    es.add("", [day], [count], symbol_size=20, effect_scale=4,
           effect_period=5, symbol="pin")
es.render()

# Next section of the article: "五星推荐河流图代码:"
from pyecharts import Style
from pyecharts import ThemeRiver

# One [date, count, star level] row per day and star level.
data = [
    ['2018/07/08', 802, '五星'], ['2018/07/08', 28, '四星'], ['2018/07/08', 9, '三星'],
    ['2018/07/08', 8, '二星'], ['2018/07/08', 4, '一星'],
    ['2018/07/07', 802, '五星'], ['2018/07/07', 166, '四星'], ['2018/07/07', 17, '三星'],
    ['2018/07/07', 0, '二星'], ['2018/07/07', 8, '一星'],
    ['2018/07/06', 667, '五星'], ['2018/07/06', 156, '四星'], ['2018/07/06', 13, '三星'],
    ['2018/07/06', 10, '二星'], ['2018/07/06', 4, '一星'],
    ['2018/07/05', 567, '五星'], ['2018/07/05', 76, '四星'], ['2018/07/05', 13, '三星'],
    ['2018/07/05', 0, '二星'], ['2018/07/05', 0, '一星'],
    ['2018/07/04', 467, '五星'], ['2018/07/04', 67, '四星'], ['2018/07/04', 16, '三星'],
    ['2018/07/04', 0, '二星'], ['2018/07/04', 0, '一星'],
    ['2018/07/03', 478, '五星'], ['2018/07/03', 56, '四星'], ['2018/07/03', 8, '三星'],
    ['2018/07/03', 0, '二星'], ['2018/07/03', 0, '一星'],
    ['2018/07/02', 531, '五星'], ['2018/07/02', 67, '四星'], ['2018/07/02', 8, '三星'],
    ['2018/07/02', 0, '二星'], ['2018/07/02', 0, '一星'],
    ['2018/07/01', 213, '五星'], ['2018/07/01', 45, '四星'], ['2018/07/01', 5, '三星'],
    ['2018/07/01', 1, '二星'], ['2018/07/01', 1, '一星'],
]

style = Style(
    title_color="#191970",
    title_pos="left",
    width=1200,
    height=600,
    background_color='#F8F8FF'
)

tr = ThemeRiver("《我不是药神》星级推荐", "数据来源:知乎ID:布道", **style.init_style)
# The first argument lists the series names used in the third column of data.
tr.add(['五星', '四星', '三星', '二星', '一星', ], data, is_label_show=True)
tr.render()

# Next section of the article: "词云图:"
import pickle
from os import path

import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Frequent words that carry no meaning for this data set; extend as needed.
EXTRA_STOPWORDS = {
    "哈哈", "电影", "真的", "就是", "真是", "中国",
    "没有", "可以", "一部", "还是", "最后", "一个",
}


def make_worldcloud(file_path):
    """Build, show and save a word cloud from the text in *file_path*.

    The text is segmented with jieba, laid out inside the shape of
    ``./1.jpg``, coloured from that same picture, shown with matplotlib and
    written to ``h11.jpg`` next to this script.
    """
    # `with` closes the file; the original left the handle open.
    with open(file_path, 'r', encoding='UTF-8') as source:
        text_from_file_with_apath = source.read()

    wordlist_after_jieba = jieba.cut(text_from_file_with_apath, cut_all=False)
    wl_space_split = " ".join(wordlist_after_jieba)
    print(wl_space_split)

    backgroud_Image = plt.imread('./1.jpg')
    print('加载图片成功!')

    # Word cloud style.
    stopwords = STOPWORDS.copy()
    stopwords.update(EXTRA_STOPWORDS)
    wc = WordCloud(
        width=1024,
        height=768,
        background_color='white',   # background colour
        mask=backgroud_Image,       # picture used as the mask
        # A font with Chinese glyphs is required, otherwise the words are
        # drawn as empty boxes. Same path as before, with a valid escape.
        font_path='E:\\simsun.ttf',
        max_words=600,              # maximum number of words drawn
        stopwords=stopwords,        # words left out of the cloud
        max_font_size=400,          # largest font size
        random_state=50,            # fixed seed for the random layout
    )
    wc.generate_from_text(wl_space_split)   # lay out the text
    img_colors = ImageColorGenerator(backgroud_Image)
    wc.recolor(color_func=img_colors)       # take word colours from the picture

    plt.imshow(wc)      # draw the word cloud
    plt.axis('off')     # hide the axes
    plt.show()

    # Save next to this module.
    d = path.dirname(__file__)
    wc.to_file(path.join(d, "h11.jpg"))
    print('生成词云成功!')


if __name__ == '__main__':
    make_worldcloud('cloud.txt')

# Next section of the article: "图像画像代码:"
import os
from math import sqrt

# Folder that holds the downloaded avatar pictures (same directory as the
# original listing, written with valid escapes).
path = 'C:\\Users\\My\\Desktop\\yaoshen\\img\\'

TILE = 128                  # edge length of one tile in pixels
OUTPUT_NAME = "final.jpg"   # mosaic file written into the same folder


def grid_side(total):
    """Return the number of tiles per row and column for *total* pictures."""
    return int(sqrt(total))


def cell_origin(index, line):
    """Return the top-left pixel of tile number *index* in a *line*-wide grid.

    Tiles are numbered row by row, starting at 0.
    """
    return (index % line) * TILE, (index // line) * TILE


def build_mosaic(folder):
    """Paste the pictures found in *folder* into one square mosaic.

    Unreadable files are reported and skipped. The result is saved as
    ``OUTPUT_NAME`` inside *folder*, and nothing is written when the folder
    holds no pictures.
    """
    # Imported here so the grid helpers can be used without Pillow installed.
    from PIL import Image

    # A mosaic left over from an earlier run must not become a tile itself.
    pathList = [os.path.join(folder, item)
                for item in os.listdir(folder) if item != OUTPUT_NAME]
    total = len(pathList)       # number of candidate pictures
    line = grid_side(total)     # tiles per row, which is also the row count
    if line == 0:
        print("没有找到可拼接的图片:%s" % folder)
        return

    NewImage = Image.new('RGB', (TILE * line, TILE * line))
    placed = 0
    for item in pathList:
        if placed == line * line:
            break
        try:
            with Image.open(item) as img:
                # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the
                # same filter under its current name.
                tile = img.resize((TILE, TILE), Image.LANCZOS)
        except IOError:
            print("第%d行,%d列文件读取失败!IOError:%s"
                  % (placed // line, placed % line, item))
            # Leave the cell free for the next picture. The original stepped
            # one cell back here, so the next tile overwrote the previous one.
            continue
        NewImage.paste(tile, cell_origin(placed, line))
        placed += 1

    NewImage.save(os.path.join(folder, OUTPUT_NAME))


if __name__ == '__main__':
    build_mosaic(path)

# Closing remark of the article: "好了,就给大家介绍到这里,希望对大家有所帮助。!"
