import re import time import jieba import pymysql import requests import pandas as pd from lxml import etree from snownlp import SnowNLP
class guba():
    """Scraper for the paginated post list of an Eastmoney Guba stock board.

    For every list page it extracts the read count, comment count, title,
    author and last-update date of each post and hands the rows of that page
    to ``DataOutput.write_to_mysql``.
    """

    # Matches one CJK ideograph. Counters containing one are not plain
    # integers (presumably abbreviated values such as "1.2万" -- TODO confirm).
    _HAN_CHAR = re.compile('[\u4e00-\u9fa5]')

    def __init__(self, host, db, user, passwd):
        """Store the MySQL connection settings and create the output helper.

        :param host: MySQL host name.
        :param db: MySQL database name.
        :param user: MySQL user name.
        :param passwd: MySQL password.
        """
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, '
                                      'like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.54'}
        self.host = host
        self.db = db
        self.user = user
        self.passwd = passwd
        self.dataoutput = DataOutput()
        # Number of proxy addresses requested so far (starts at 1).
        self.ip_num = 1

    def get_new_ip(self):
        """Fetch a fresh proxy address from the proxy API.

        :return: a ``proxies`` mapping usable by ``requests``, or ``None``
            once the budget of 1000 proxy requests is used up (``requests``
            then connects directly).
        :raises requests.exceptions.RequestException: if the proxy API call
            fails.
        """
        if self.ip_num > 1000:
            return None
        # NOTE: '获取代理的api' is a placeholder that must be replaced with
        # the real proxy-provider endpoint before the crawler can run.
        ip_port = requests.get('获取代理的api', timeout=6)
        ip = ip_port.text.replace('\r\n', '')
        self.ip_num += 1
        return {"http": "http://" + ip, "https": "https://" + ip}

    def _rotate_proxy(self, current):
        """Return a new proxy mapping, or *current* if the proxy API fails."""
        try:
            return self.get_new_ip()
        except requests.exceptions.RequestException as e:
            print(str(e))
            return current

    def _report_failure(self, industry, company_name, page, error):
        """Print the location and message of a failed page attempt."""
        print(industry, company_name, page, "---" + str(time.time()))
        print(str(error))

    def rm_special_letters(self, old_list):
        """Return a copy of *old_list* with CRLF pairs and spaces removed
        from every string."""
        return [i.replace('\r\n', '').replace(' ', '') for i in old_list]

    def date_to_timestamp(self, year, timestr):
        """Convert a ``MM-DD`` string plus a year to a local-time timestamp.

        :param year: four-digit year.
        :param timestr: date in ``MM-DD`` form.
        :return: seconds since the epoch (float) for local midnight.
        :raises ValueError: if the combined date is not a valid calendar day.
        """
        time_array = time.strptime(str(year) + '-' + timestr, "%Y-%m-%d")
        return time.mktime(time_array)

    def dangu_pinglun(self, url, company_name, industry, start_year=None,
                      stop_timestamp=1521475200, max_idle_pages=50):
        """Crawl one stock board page by page and store its posts.

        The list shows dates without a year, so the year is tracked while
        paging: whenever the month number of a post is larger than that of
        the previous post, the year is decremented.

        :param url: list URL prefix; ``<page>.html`` is appended to it.
        :param company_name: company name stored with every row.
        :param industry: industry (sector) name stored with every row.
        :param start_year: year assumed for the first post; defaults to the
            current local year (the value used to be hard-coded to 2021,
            which mis-dates every post in any other year).
        :param stop_timestamp: crawling stops once a post at or before this
            epoch timestamp has been seen.
        :param max_idle_pages: give up after this many consecutive attempts
            that stored no rows (empty page, parse failure, network error).
            Without this bound a board whose history does not reach
            *stop_timestamp* would be paged forever. ``None`` disables it.
        :raises requests.exceptions.RequestException: if the very first
            proxy request fails.
        """
        mtimestamp = time.time()
        page = 1
        year = time.localtime().tm_year if start_year is None else start_year
        latest_mounth = 12
        idle_pages = 0
        proxyip = self.get_new_ip()
        while True:
            datalist = []
            stored = False
            current_page = page
            try:
                if page % 50 == 0:
                    # Progress report and proxy rotation every 50 pages.
                    print(company_name, page, "----" + str(time.time()))
                    proxyip = self.get_new_ip()
                murl = url + str(page) + '.html'
                resp = requests.get(murl, headers=self.headers, proxies=proxyip, timeout=10)
                htmltree = etree.HTML(resp.text)
                # The first element of the sliced columns is dropped
                # (presumably the table header row -- TODO confirm).
                yuedu_count = htmltree.xpath('//span[@class="l1 a1"]/text()')
                yuedu_count = self.rm_special_letters(yuedu_count)[1:]
                pinglun_count = htmltree.xpath('//span[@class="l2 a2"]/text()')
                pinglun_count = self.rm_special_letters(pinglun_count)[1:]
                title_list = htmltree.xpath('//span[@class="l3 a3"]/a/@title')
                username_list = htmltree.xpath('//span[@class="l4 a4"]/a//text()')
                last_time_list = htmltree.xpath('//span[@class="l5 a5"]/text()')[1:]

                for num in range(len(pinglun_count)):
                    # Non-numeric counters are replaced by a flat 20000.
                    if self._HAN_CHAR.search(yuedu_count[num]):
                        yuedu_count[num] = 20000
                    if self._HAN_CHAR.search(pinglun_count[num]):
                        pinglun_count[num] = 20000
                    lastdate = last_time_list[num].split(' ')[0]
                    month = int(lastdate.split('-')[0])
                    if month > latest_mounth:
                        # Month number went up while paging: previous year.
                        year -= 1
                    mtimestamp = self.date_to_timestamp(year, lastdate)
                    datalist.append({'scan': yuedu_count[num],
                                     'comment_num': pinglun_count[num],
                                     'title': title_list[num],
                                     'username': username_list[num],
                                     'mdate': mtimestamp,
                                     'company': company_name,
                                     'industry': industry})
                    latest_mounth = month

                page = current_page + 1
                if datalist:
                    self.dataoutput.write_to_mysql(host=self.host, db=self.db, user=self.user,
                                                   passwd=self.passwd, datalist=datalist)
                    stored = True
                time.sleep(1)
            except requests.exceptions.RequestException as e:
                # Network or proxy trouble: switch proxy and retry this page.
                self._report_failure(industry, company_name, page, e)
                proxyip = self._rotate_proxy(proxyip)
            except Exception as e:
                # Best-effort crawl: a page that cannot be parsed or stored
                # (mismatched columns, invalid date, DB error) is skipped.
                self._report_failure(industry, company_name, page, e)
                page = current_page + 1

            idle_pages = 0 if stored else idle_pages + 1
            if mtimestamp <= stop_timestamp:
                print('时间到')
                break
            if max_idle_pages is not None and idle_pages >= max_idle_pages:
                print(company_name, 'stopped after', idle_pages, 'attempts without new posts')
                break
class DataOutput():
    """Sentiment helpers and MySQL persistence for scraped Guba posts."""

    def __init__(self):
        self.__tablename = 'info_guba'
        self.__tablekeys = '(myid,scans,comments,titles,usernames,mdates,f_scores,company_name,industry)'
        # Resources below are loaded from disk on first use and then reused.
        self._stopwords = None          # set of stop words
        self._lexicon = None            # word -> sentiment score
        self._userdict_loaded = False   # whether the jieba user dict is loaded

    def rm_special_letter(self, line):
        """Return *line* without single quotes, double quotes, ``#`` and
        backslashes."""
        for i in ["\'", "\"", "#", "\\"]:
            line = line.replace(i, "")
        return line

    def feeling(self, line):
        """Score the sentiment of *line* with SnowNLP.

        :return: ``SnowNLP(line).sentiments``, or ``0`` if SnowNLP raises.
        """
        try:
            return SnowNLP(line).sentiments
        except Exception:
            # Best effort: text SnowNLP cannot process scores 0.
            return 0

    def __rm_stopwords(self, wordlist):
        """Return *wordlist* without stop words and whitespace-only tokens.

        Stop words are read from ``tool_files/stopwords.txt`` (one per line)
        on first use. Tokens are compared as whole words; the previous
        substring test against the raw file text also removed every token
        that merely occurred inside a longer stop word.
        """
        if self._stopwords is None:
            with open('tool_files/stopwords.txt', 'r', encoding='utf-8') as r:
                self._stopwords = {w.strip() for w in r if w.strip()}
        stopwords = self._stopwords
        return [w for w in wordlist if w.strip() and w not in stopwords]

    def feeling2(self, line):
        """Score *line* with the sentiment lexicon in
        ``tool_files/sentiment_score.txt`` (Boson sentiment dictionary).

        The text is segmented with jieba, stop words are removed, and the
        scores of all tokens found in the lexicon are averaged.

        :return: the average rounded to 5 decimals, or ``0`` when no token
            is in the lexicon or the average is exactly zero.
        """
        if self._lexicon is None:
            df = pd.read_table("tool_files/sentiment_score.txt", sep=" ",
                               names=['key', 'score_snownlp'])
            lexicon = {}
            for word, value in zip(df['key'].values.tolist(),
                                   df['score_snownlp'].values.tolist()):
                # First occurrence wins, as with the former list.index lookup.
                lexicon.setdefault(word, value)
            self._lexicon = lexicon
        if not self._userdict_loaded:
            # Must happen before segmentation so the custom words are used.
            jieba.load_userdict('tool_files/userdict.txt')
            self._userdict_loaded = True

        lexicon = self._lexicon
        segs = self.__rm_stopwords(jieba.lcut(line))
        score_list = [lexicon[x] for x in segs if x in lexicon]
        if not score_list:
            return 0
        last_score = sum(float(s) for s in score_list) / len(score_list)
        if last_score == 0:
            return 0
        return round(last_score, 5)

    def __mysql_data_rechecking(self, item, ids_inmysql):
        """Build the row id of *item* and check it against known ids.

        :param item: row dict with ``title``, ``username`` and ``mdate``.
        :param ids_inmysql: container of ids already stored.
        :return: ``(title, myid)`` for a new row (title with special
            characters removed), or ``None`` if the id is already known.
        """
        title = self.rm_special_letter(item['title'])
        # The id format must stay as is: existing rows are keyed by it.
        myid = item['username'] + str(item['mdate'])[3:-4] + title[:100]
        if myid in ids_inmysql:
            return None
        return title, myid

    def write_to_mysql(self, datalist, host, db, user, passwd):
        """Insert the rows of *datalist* that are not yet in the table.

        Each row is committed on its own; a failing row is reported, rolled
        back and skipped so the remaining rows are still written.

        :param datalist: list of row dicts with the keys ``scan``,
            ``comment_num``, ``title``, ``username``, ``mdate``, ``company``
            and ``industry``.
        :param host: MySQL host name.
        :param db: MySQL database name.
        :param user: MySQL user name.
        :param passwd: MySQL password.
        """
        if not datalist:
            return
        conn = pymysql.connect(host=host, user=user, password=passwd, database=db)
        try:
            cursor = conn.cursor()
            cursor.execute('SELECT myid FROM {}'.format(self.__tablename))
            known_ids = {row[0] for row in cursor.fetchall()}
            # Only the internal table/column names are formatted in; all
            # scraped values are passed as parameters so the driver escapes
            # them.
            sql = "INSERT INTO {TABLENAME}{keys} " \
                  "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)".format(
                      TABLENAME=self.__tablename, keys=self.__tablekeys)
            for item in datalist:
                record = self.__mysql_data_rechecking(item, known_ids)
                if record is None:
                    continue
                title, myid = record
                feeling = 0  # sentiment score is not computed at insert time
                # Values are sent as strings, as the former quoted literals
                # were.
                params = tuple(str(v) for v in (
                    myid, item['scan'], item['comment_num'], title,
                    item['username'], item['mdate'], feeling,
                    item['company'], item['industry']))
                try:
                    cursor.execute(sql, params)
                    conn.commit()
                    # Also guards against duplicates inside the same batch.
                    known_ids.add(myid)
                except Exception as e:
                    if 'PRIMARY' in str(e):
                        print('查重失败')
                    else:
                        print(item)
                    print(str(e) + "---" + str(time.time()))
                    conn.rollback()
        finally:
            conn.close()
# Crawl targets grouped by industry.
# Each entry is "<company name> <stock code>", separated by one space; the
# code is the board identifier used in the Guba list URL.
data01 = {
    '批发和零售业': ['大参林 603233', '广百股份 002187', '来伊份 603777'],
    '制造业': ['中国中车 601766', '永兴材料 002756', '海思科 002653'],
    '房地产业': ['格力地产 600185', '绿景控股 000502', '万科A 000002'],
    '租赁和商务服务业': ['深圳华强 000062', '渤海租赁 000415', '轻纺城 600790'],
    '采矿业': ['兴业矿业 000426', '冀中能源 000937', '中国石化 600028'],
    '交通运输、仓储和邮政业': ['中远海控 601919', '宜昌交运 002627', '大众交通 600611'],
    '信息传输、软件和信息技术服务业': ['恒生电子 600570', '中国联通 600050', '恒华科技 300365'],
    # 'ustal' is a non-numeric board code.
    '教育': ['好未来 ustal', '中公教育 002607', '紫光学大 000526'],
    '卫生和社会工作业': ['通策医疗 600763', '迪安诊断 300244', '爱尔眼科 300015'],
    '文化、体育和娱乐业': ['凤凰传媒 601928', '新华传媒 600825', '长江传媒 600757'],
    '金融业': ['民生银行 600016', '中国平安 601318', '国信证券 002736'],
    '建筑业': ['棕榈股份 002431', '上海建工 600170', '隧道股份 600820'],
    '电力、热力、燃气及水的生产和供应业': ['滨海能源 000695', '太阳能 000591', '上海电力 600021'],
    '水利、环境和公共设施管理业': ['远达环保 600292', '碧水源 300070', '启迪环境 000826'],
}
if __name__ == '__main__':
    # Crawl every company listed in data01, industry by industry.
    # NOTE(review): database credentials are hard-coded here; consider
    # reading them from configuration or the environment.
    gb = guba(host='localhost', db='guba', user='root', passwd='root')
    # The table is named data01; iterating the undefined name ``data``
    # raised NameError as soon as the script started.
    for industry, companies in data01.items():
        for company in companies:
            parts = company.split(' ')
            name = parts[0]
            stock_code = parts[1]
            url = 'http://guba.eastmoney.com/list,' + str(stock_code) + ',f_'
            gb.dangu_pinglun(url, name, industry)