广州双城热恋创意文化传播有限公司

现在的位置: 主页 > 企业简介 > 文章列表

文章正文

python爬虫(终)

作者:广州双城热恋创意文化传播有限公司 来源:www.666loveyou.com 发布时间:2017-09-08 18:01:38
python爬虫(终)

主函数:

# NOTE(review): This is a web-scraped copy of a Python 2 multi-threaded Tianya-forum
# crawler ("python爬虫(终)"). The scrape destroyed the source: statements are mashed
# onto a few giant lines, the HTML tags inside every regex literal were stripped (the
# patterns can no longer match real pages), and every URL literal was blanked to ''.
# The code below is preserved byte-for-byte; comments only. It will NOT parse or run
# as-is and should be recovered from the original source, not from this page.
#
# Line below: module header (Python 2, author/date docstrings) + imports (uuid,
# requests, re, json, time, MySQLdb, project-local sqlUtil2 save helpers, a dummy
# thread pool) and the fetch/parse helpers:
#   getHtml(url)            - GET with a Chrome User-Agent, timeout=1s, forces utf-8.
#   getAttentionHtml(...)   - paged "following" JSON API (28 per page; url='' — host lost in scrape).
#   getFansHtml(...)        - same for the "follower" JSON API.
#   getContnetByReg(...)    - re.findall with re.S (DOTALL) — [sic] name typo kept.
#   getReplyData/getTopicData/getAuthorInfo - regex-scrape reply rows, topic header,
#       and the author's follow/fans counts from page HTML; these regexes are the
#       scrape-damaged ones (HTML tags missing) — TODO recover originals.
#coding: utf-8 ''' Created on 2016年4月21日 @author: Administrator ''' import uuid ''' 多线程爬虫 天涯杂谈 爬取4月一个月的数据 ''' import requests,re import json import time import MySQLdb from sqlUtil2 import saveTopic,saveUser,saveRelation,saveComment from multiprocessing.dummy import Pool as ThreadPool global s def getHtml(url): headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'} html = requests.get(url,headers=headers,timeout=1)#s设置超时的时间1s html.encoding='utf-8' return html def getAttentionHtml(userId,pageNo): url='' data={ 'method' :'following.ice.select', 'params.userId': userId, 'params.pageSize':'28', 'params.pageNo':pageNo } headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'} html = requests.get(url,headers=headers,params=data ,timeout=1)#s设置超时的时间1s html.encoding='utf-8' return html def getFansHtml(userId,pageNo): url='' data={ 'method' :'follower.ice.select', 'params.userId': userId, 'params.pageSize':'28', 'params.pageNo':pageNo } headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'} html = requests.get(url,headers=headers,params=data ,timeout=1)#s设置超时的时间1s html.encoding='utf-8' return html def getContnetByReg(reg,text): return re.findall(reg, text, re.S) def getReplyData(url): reg=r'class="atl-item".+?class="atl-info".+?href="(.+?)".+?>(.+?).+?(.+?).+?class="bbs-content">(.+?)' dataList = getContnetByReg(reg, getHtml(url).text) return dataList def getTopicData(url): reg=r'class="s_title".+?(.+?).+?div class="atl-info".+?href="(.+?)".+?>(.+?).+?(.+?).+?(.+?).+?(.+?).+?class="atl-main".+?class="bbs-content clearfix">(.+?)' dataList = getContnetByReg(reg, getHtml(url).text) return 
# Line below: 'dataList' completes getAuthorInfo's return (split across lines by the
# scrape). Then:
#   getAttentionList/getFansList(userId,num) - call the JSON endpoints and return
#       jsonstr["data"]["user"] (prints the raw response for debugging).
#   printFans(userId,num,username,conn)      - walks ceil(num/28) follower pages
#       (capped to one tenth when >= 200 pages — "数据量太大 相对减少" = data volume too
#       large, reduce); for each follower saves a (uuid4, follower, username)
#       relation via saveRelation and a user row via saveUser with a
#       '%Y-%m-%d %X' grab timestamp; every failure is printed and skipped.
#   printAttention(...)                      - mirror image for "following" pages,
#       relation direction reversed to (uuid4, username, followee); no page cap.
#   getTopicAllInfo signature starts at the end of this line.
# NOTE(review): authorUrl = '' + str(res["id"]) — the URL prefix was blanked by the
# scrape; only the bare id survives.
dataList def getAttentionList(userId,num): jsonstr = getAttentionHtml(userId,num).json() print getAttentionHtml(userId,num).text return jsonstr["data"]["user"] def getFansList(userId,num): jsonstr = getFansHtml(userId,num).json() print getFansHtml(userId,num).text return jsonstr["data"]["user"] def printFans(userId,num,username,conn): print '================粉丝=====================' if(num%28==0): x = num/28 else: x= num/28 + 1 #数据量太大 相对减少 if(x>=200): x=x/10 for i in range(1,x+1): print '------第',i,'页------' fansList = getFansList(userId,i) for res in fansList: try: #保存关系 relationParams = (uuid.uuid4(),res["name"],username) saveRelation(relationParams, conn) except Exception,e: print 'failed!..','exception is: ',e try: #保存用户 ISOTIMEFORMAT='%Y-%m-%d %X' grabTime = time.strftime(ISOTIMEFORMAT, time.localtime()) authorUrl = '' + str(res["id"]) userParams = (res["id"],res["fansCount"],res["followCount"],res["name"],authorUrl,grabTime) saveUser(userParams, conn) print res["id"],res["name"],res["followCount"],res["fansCount"] except Exception,e: print 'failed!..','exception is: ',e def printAttention(userId,num,username,conn): print '================关注的人=====================' if(num%28==0): x = num/28 else: x= num/28 + 1 print x for i in range(1,x+1): print '------第',i,'页------' attentList = getAttentionList(userId,i) for res in attentList: try: relationParams = (uuid.uuid4(),username,res["name"]) saveRelation(relationParams, conn) except Exception,e: print 'failed!..','exception is: ',e try: #保存用户 ISOTIMEFORMAT='%Y-%m-%d %X' grabTime = time.strftime(ISOTIMEFORMAT, time.localtime()) authorUrl = '' + str(res["id"]) userParams = (res["id"],res["fansCount"],res["followCount"],res["name"],authorUrl,grabTime) saveUser(userParams, conn) except Exception,e: print 'failed!..','exception is: ',e print res["id"],res["name"],res["followCount"],res["fansCount"] def getTopicAllInfo(topicDataList,replyDataList,authorUrl,topiclink): 
# Line below: body of getTopicAllInfo. Opens its own MySQLdb connection
# (localhost/root/networkpublicopinionmap3, utf8 — credentials hard-coded). For each
# topic tuple: prints title/author/post-time/scan/reply/content fields, derives
# userId as the last path segment of the author link, fetches follow/fans counts via
# getAuthorInfo, saves the author via saveUser; then, only for ids not yet in the
# global `s` (s.add(userId) marks it seen — presumably a set shared across threads,
# TODO confirm it is initialized and synchronized elsewhere), cascades into
# printAttention/printFans when the counts are non-zero. Then saves the topic row
# (topic id = topiclink.split('-')[-2]) via saveTopic, and begins the same
# per-reply persistence loop for the first page of replies (continues on next line).
conn=MySQLdb.connect(host='localhost',user='root',passwd='1234',db='networkpublicopinionmap3',port=3306,charset='utf8') for topic in topicDataList: #得到发帖时间 postTime = topic[3].strip().split(':')[1] print '******',s print 'topiclink: ' , topiclink print 'topicId: ' , topiclink.split('-')[-2] print 'title: ',topic[0].strip() print 'authorLink: ',topic[1].strip() print 'authorName: ',topic[2].strip() print 'postTime: ',postTime print 'scanNum: ',topic[4].strip().split(':')[1] print 'replyNum: ',topic[5].strip().split(':')[1] print 'content: ',topic[6].strip() userId = topic[1].strip().split('/')[-1] infoList = getAuthorInfo(topic[1].strip()) #获取作者的信息(粉丝,关注等等) for info in infoList: print '\tattentionnNums: ',int(info[1].strip()) print '\tfansNum: ',int(info[3].strip()) try: #保存作者的信息 ISOTIMEFORMAT='%Y-%m-%d %X' grabTime = time.strftime(ISOTIMEFORMAT, time.localtime()) userparams = (userId,info[3].strip(),info[1].strip(),topic[2].strip(),topic[1].strip(),grabTime) saveUser(userparams, conn) except Exception,e: print 'failed!..','exception is: ',e if userId not in s: s.add(userId) if(int(info[1].strip())!=0): #保存关注的人和作者的关系 #保存关注人 printAttention(userId,int(info[1].strip()),topic[2].strip(),conn) if(int(info[3].strip())!=0): #保存粉丝和作者的关系 #保存粉丝 printFans(userId,int(info[3].strip()),topic[2].strip(),conn) try: #保存帖子 ISOTIMEFORMAT='%Y-%m-%d %X' grabTime = time.strftime(ISOTIMEFORMAT, time.localtime()) params = (topiclink.split('-')[-2],topiclink,topic[0].strip(),topic[6].strip(),topic[4].strip().split(':')[1],topic[5].strip().split(':')[1],topic[3].strip().split(':')[1],userId,grabTime) saveTopic(params,conn) except Exception,e: print 'saveTopic-failed!..','exception is: ',e for data in replyDataList: print 'replyerLink: ' , data[0].strip() print 'replyerName: ' , data[1].strip() print 'dateTime: ' , data[2].strip().split(':')[1] print 'content: ' , data[3].strip() replyerId = data[0].strip().split('/')[-1] infoList = getAuthorInfo(data[0].strip()) #获取作者的信息(粉丝,关注等等) for info in 
# Line below: remainder of the reply loop in getTopicAllInfo — saves the replier as
# a user, cascades into printAttention/printFans for unseen replier ids, then saves
# the comment row (uuid4 key, content, time, topic id, replier id, grab timestamp)
# via saveComment — then conn.close(). After that, getReplyAllInfo begins: same
# MySQL connection setup, used for reply pages 2+ of a topic ("第二页的评论开始" =
# "page-2 comments start").
infoList: print '\tattentionnNums: ',info[1].strip() print '\tfansNum: ',info[3].strip() try: #保存作者的信息 ISOTIMEFORMAT='%Y-%m-%d %X' grabTime = time.strftime(ISOTIMEFORMAT, time.localtime()) relplyerparams = (replyerId,info[3].strip(),info[1].strip(),data[1].strip(),data[0].strip(),grabTime) saveUser(relplyerparams, conn) except Exception,e: print 'failed!..','exception is: ',e if replyerId not in s: s.add(replyerId) if(int(info[1].strip())!=0): printAttention(replyerId,int(info[1].strip()),data[1].strip(),conn) if(int(info[3].strip())!=0): printFans(replyerId,int(info[3].strip()),data[1].strip(),conn) try: #保存评论 ISOTIMEFORMAT='%Y-%m-%d %X' grabTime = time.strftime(ISOTIMEFORMAT, time.localtime()) commentParams = (uuid.uuid4(),data[3].strip(),data[2].strip().split(':')[1],topiclink.split('-')[-2],replyerId,grabTime) saveComment(commentParams,conn) except Exception,e: print 'failed!..','exception is: ',e conn.close(); def getReplyAllInfo(topicDataList,replyDataList,authorUrl,topiclink): conn=MySQLdb.connect(host='localhost',user='root',passwd='1234',db='networkpublicopinionmap3',port=3306,charset='utf8') print '............第二页的评论开始............' 
# Line below: body of getReplyAllInfo — identical per-reply persistence to the loop
# above (saveUser, relation cascade for unseen ids, saveComment), then conn.close().
# Then spider(url) begins: fetches a topic-list page, regex-extracts topic links and
# the next-page link (regexes scrape-damaged; originalUrl/authorUrl prefixes blanked
# to ''), pages forward with a counter, and filters by post time: a '2016-03' post
# time ends the crawl, anything not starting '2016-04' is skipped — i.e. the
# "April only" crawl promised in the module docstring.
# NOTE(review): this fragment uses postTimeReg, pageHtml and result, whose
# assignments do not appear anywhere in the scraped text — those statements were
# eaten by the scrape; the function is not recoverable from this copy.
for data in replyDataList: print 'topiclink: ' , topiclink print 'replyerLink: ' , data[0].strip() print 'replyername: ' , data[1].strip() print 'dateTime: ' , data[2].strip().split(':')[1] print 'content: ' , data[3].strip() replyerId = data[0].strip().split('/')[-1] infoList = getAuthorInfo(data[0].strip()) #获取作者的信息(粉丝,关注等等) for info in infoList: print '\tattentionnNums: ',info[1].strip() print '\tfansNum: ',info[3].strip() try: #保存作者的信息 ISOTIMEFORMAT='%Y-%m-%d %X' grabTime = time.strftime(ISOTIMEFORMAT, time.localtime()) relplyerparams = (replyerId,info[3].strip(),info[1].strip(),data[1].strip(),data[0].strip(),grabTime) saveUser(relplyerparams, conn) except Exception,e: print 'failed!..','exception is: ',e if replyerId not in s: s.add(replyerId) if(int(info[1].strip())!=0): printAttention(replyerId,int(info[1].strip()),data[1].strip(),conn) if(int(info[3].strip())!=0): printFans(replyerId,int(info[3].strip()),data[1].strip(),conn) try: #保存评论 ISOTIMEFORMAT='%Y-%m-%d %X' grabTime = time.strftime(ISOTIMEFORMAT, time.localtime()) comment2Params = (uuid.uuid4(),data[3].strip(),data[2].strip().split(':')[1],topiclink.split('-')[-2],replyerId,grabTime) saveComment(comment2Params,conn) except Exception,e: print 'saveComment()-failed!..','exception is: ',e conn.close(); def spider(url): originalUrl = '' authorUrl = '' reg=r'(.+?)' regLink = r'div class="links".+?.+?href="(.+?)"' html = getHtml(url) nextLink = getContnetByReg(regLink, html.text) print 'nextLink: ', originalUrl + nextLink[0] n=1 while(nextLink[0]): print '...............第',n,'页..................' contentList = getContnetByReg(reg, html.text) for content in contentList: resreg = r'class="td-title faceblue">.+?href="(.+?)".+?(.+?).+?(.+?)'#判断postTimr的正则表达式 postTimeList = getContnetByReg(postTimeReg, pageHtml.text) postTime= postTimeList[0].strip().split(':')[1] print 'postTime: ',postTime if(postTime.startswith('2016-03')): print 'end..' return if(not postTime.startswith('2016-04')): print 'continue...' 
# Line below: tail of the April filter (continue/'start..'), the per-topic fetch of
# reply and topic data, and the start of the "does the topic have page numbers"
# check. The scrape TRUNCATES the file mid-literal (inside isPageReg): the rest of
# spider and the main/ThreadPool entry point implied by the imports are lost.
continue print 'start..' #获取帖子的信息 replyDataList = getReplyData(originalUrl + result[0].strip())# topicDataList = getTopicData(originalUrl + result[0].strip()) print '=================================================' #先判断有没有页码 isPageReg=r'class="atl-head".+?

企业建站2800元起,携手武汉肥猫科技,做一个有见地的颜值派!更多优惠请戳:湖北SEO http://hubei.raoyu.net


COPYRIGHT © 2015 广州双城热恋创意文化传播有限公司 ALL RIGHTS RESERVED.
网站地图 技术支持:肥猫科技
精彩专题:网站建设
购买本站友情链接、项目合作请联系客服QQ:2500-38-100