标签(空格分隔): 数据可视化 分词 大数据 地图
项目简介:
- 通过爬虫爬取用户发布的状态(约10万条),生成词云图
- 通过爬虫爬取的用户地理位置信息(约5万用户信息),生成用户分布地理位置图
1.词云
关键代码块如下:
'''从数据库中读取用户发布的状态'''
def fetchData(self, start, len):
    """Fetch one page of status texts from the ``keephot_new_copy`` table.

    Args:
        start: Row offset passed as the first ``LIMIT`` argument.
        len: Maximum number of rows to fetch (second ``LIMIT`` argument).
            The name shadows the builtin ``len``; it is kept so existing
            keyword callers keep working.

    Returns:
        The rows returned by ``cursor.fetchall()`` (each row holds only the
        ``content`` column), or ``""`` if any step fails. On failure the
        error is printed and the connection is rolled back.
    """
    # The values are bound by the driver rather than spliced in with Python
    # string formatting. Assumes a DB-API driver using the "format"
    # paramstyle (e.g. PyMySQL) -- TODO confirm against the connection setup.
    sql = 'select content from keephot_new_copy limit %s,%s'
    try:
        with self.db.cursor() as cursor:
            # int() guarantees that only integers ever reach the LIMIT clause.
            cursor.execute(sql, (int(start), int(len)))
            return cursor.fetchall()
    except Exception as e:
        # Best-effort read: report the problem and hand back an empty value.
        print("获取数据库数据失败" + str(e))
        self.db.rollback()
        return ""
def getDataLen(self):
    """Count the rows stored in the ``keephot_new_copy`` table.

    Returns:
        The row returned by ``cursor.fetchone()`` for ``select count(*)``
        (the count is its only column), or the integer ``0`` if the query
        fails. Note the two cases have different types: callers receive a
        row on success and a bare ``0`` on failure.
    """
    try:
        with self.db.cursor() as cursor:
            sql = 'select count(*) from keephot_new_copy '
            cursor.execute(sql)
            return cursor.fetchone()
    except Exception as e:
        # Best-effort read: report the problem and fall back to zero.
        print("获取数据长度失败" + str(e))
        self.db.rollback()
        return 0
# Dump the statuses stored in the database into hotcontent.txt, one per line.
# NOTE(review): `len` is presumably the total row count computed earlier in
# the script (it shadows the builtin) -- confirm against the caller.
total_rows = int(len)
batch_size = 100
# Ceiling division: the final, partially filled batch must be fetched too,
# otherwise up to batch_size - 1 trailing rows are silently dropped.
batch_count = -(-total_rows // batch_size)
# Open the output once in append mode; `with` closes it even if a fetch or a
# write raises part-way through.
with open('hotcontent.txt', 'a', encoding='utf-8') as f:
    for i in range(batch_count):
        for item in test.fetchData(i * batch_size, batch_size):
            content = item[0]
            f.write(content + "\n")
'''用结巴分词完成分词,统计词频'''
# Read the whole status dump into memory; `with` closes the file even if the
# read fails.
with open('hotcontent.txt', encoding='UTF-8') as f:
    text = f.read()
# Register the stop-word list so those words are filtered out of the keywords.
jieba.analyse.set_stop_words("stopwords.txt")
# Extract up to 1000 keywords, restricted to the part-of-speech tags listed in
# allowPOS.
word_list = jieba.analyse.extract_tags(text, topK=1000, allowPOS=('ns', 'n', 'vn', 'v'))
# The word-cloud generator takes a single whitespace-separated string.
result = " ".join(word_list)
'''用wordcloud开源库绘制词云'''
# Load the background picture; its pixel array is used both as the mask and as
# the colour source for the cloud.
mask_image = Image.open(path.join(d, "wukong.jpg"))
alice_coloring = np.array(mask_image)

# Configure the word cloud.
wc = wordcloud.WordCloud(
    font_path='simhei.ttf',
    mask=alice_coloring,
    background_color="white",
    max_words=500,
    max_font_size=80,
    random_state=42,
)

# Lay out the cloud from the space-separated keyword string.
my_wordcloud = wc.generate(result)

# Build a colour function from the picture and recolour the cloud with it.
image_colors = wordcloud.ImageColorGenerator(alice_coloring)
recolored = wc.recolor(color_func=image_colors)

# Display the recoloured cloud without axes.
plt.figure()
plt.imshow(recolored, interpolation="bilinear")
plt.axis("off")
plt.show()

# Save the cloud to disk.
wc.to_file('wukong.png')
生成的效果图如下:
2.用户地理位置分布可视化
开源库pyecharts用于可视化地理位置、langconv用于中文繁体简体转换
关键代码块如下:
'''从数据库获取用户地理位置信息'''
def get_map(self):
    """Collect per-city user counts for the map visualisation.

    Runs a grouped count over ``user_info_copy`` and normalises each city
    name: the "市" and "特别行政区" suffixes are removed and the name is
    passed through ``Converter('zh-hans')`` (Traditional -> Simplified
    Chinese).

    Returns:
        A list of ``(city_name, user_count)`` tuples in the order returned
        by the query (descending count), or ``""`` if anything fails. On
        failure the error is printed and the connection is rolled back.
    """
    # Placeholder values that do not name a real city.
    excluded = ('地球', '**省')
    map_info = []
    try:
        with self.db.cursor() as cursor:
            sql = 'select city,count(*) from user_info_copy where country="**" group by city order by count(*) desc'
            cursor.execute(sql)
            result = cursor.fetchall()
        for city, count in result:
            # Skip empty names, NULL cities (None) and the placeholders; a
            # None city would otherwise raise on .replace() and throw away
            # every row gathered so far.
            if not city or city in excluded:
                continue
            cityname = city.replace("市", "")
            cityname = cityname.replace("特别行政区", "")
            # Convert Traditional Chinese place names to Simplified Chinese.
            cityname = Converter('zh-hans').convert(cityname)
            map_info.append((cityname, count))
        return map_info
    except Exception as e:
        # Best-effort read: report the problem and hand back an empty value.
        print("获取用户地理位置失败" + str(e))
        self.db.rollback()
        return ""
'''绘制注册用户分布地图'''
def plot_map(self, data, path="render.html"):
    """Render the registered-user distribution as a pyecharts Geo chart.

    Args:
        data: City/count pairs, e.g. the list produced by ``get_map``; it is
            split into names and values with ``geo.cast``.
        path: Output HTML file. Defaults to ``"render.html"``, which is
            presumably what a bare ``geo.render()`` writes -- confirm against
            the installed pyecharts version.
    """
    geo = Geo(
        "keep注册用户地理位置分布",
        "keepers in China",
        title_color="#fff",
        title_pos="center",
        width=1200,
        height=600,
        background_color='#404a59',
    )
    # Split the (city, count) pairs into parallel name and value sequences.
    attr, value = geo.cast(data)
    geo.add(
        "",
        attr,
        value,
        visual_range=[0, 200],
        visual_text_color="#fff",
        symbol_size=15,
        is_visualmap=True,
    )
    geo.render(path)
最终生成的HTML网页即仓库中的render.html,效果图如下: