這篇文章將為大家詳細講解有關 Python 怎麼爬取人人網新鮮事，文章內容質量較高，因此小編分享給大家做個參考，希望大家閱讀完這篇文章後對相關知識有一定的了解。
Python 實現登錄人人網並抓取新鮮事的方法（以下為 Python 2 程式碼，依賴 sgmllib、urllib2 與 cookielib）：
from sgmllib import SGMLParser
import sys,urllib2,urllib,cookielib
class spider(SGMLParser):
    """Log in to renren.com and collect feed text from the returned page.

    This is Python 2 code: it relies on ``sgmllib``, ``urllib2`` and
    ``cookielib``.  Typical use is ``login()`` followed by
    ``feed(self.file)`` and then ``show()``.

    The parser callbacks implement a small state machine:

    * text inside ``<h4><a>...</a></h4>`` is accumulated into ``names``;
    * other text inside the ``<h4>`` is stored in ``dic`` under ``names``;
    * after the ``<h4>`` closes, text inside the following
      ``<div class="content">`` (including nested ``<div>`` elements) is
      stored under the same key.
    """

    def __init__(self, email, password):
        """Initialise parser state and install a cookie-aware URL opener.

        Args:
            email: account e-mail sent as the ``email`` form field.
            password: account password sent as the ``password`` form field.

        Side effects:
            Installs a global ``urllib2`` opener backed by a fresh
            ``CookieJar`` so cookies set by the login response are kept for
            later ``urllib2.urlopen`` calls.
        """
        SGMLParser.__init__(self)

        # --- parser state -------------------------------------------------
        self.h4 = False           # currently inside an <h4>
        self.h4_is_ready = False  # an <h4> has closed; a content <div> may follow
        self.div = False          # currently inside a <div class="content">
        self.h4_and_div = False   # inside a content <div> tied to the last <h4>
        self.a = False            # inside an <a> within an <h4> or content <div>
        self.depth = 0            # nesting level of <div>s inside the content <div>
        self.names = ""           # text collected from <a> inside the current <h4>
        self.dic = {}             # names -> list of text fragments

        # --- login data ---------------------------------------------------
        self.email = email
        self.password = password
        self.domain = 'renren.com'
        # Defined up front so feed(self.file) before login() parses an empty
        # document instead of raising AttributeError.
        self.file = ''

        # The original wrapped these calls in ``try/except: raise/else``,
        # which only re-raised; any error still propagates unchanged here.
        cookie_jar = cookielib.CookieJar()
        cookie_handler = urllib2.HTTPCookieProcessor(cookie_jar)
        urllib2.install_opener(urllib2.build_opener(cookie_handler))

    def login(self):
        """POST the credentials to the login URL and keep the response body.

        The response body is stored in ``self.file``.  Errors raised by
        ``urllib2`` propagate to the caller.

        NOTE(review): the login URL uses plain ``http://``, so the password
        is not sent over TLS.
        """
        url = 'http://www.renren.com/PLogin.do'
        postdata = {
            'email': self.email,
            'password': self.password,
            'domain': self.domain,
        }
        req = urllib2.Request(url, urllib.urlencode(postdata))
        response = urllib2.urlopen(req)
        try:
            self.file = response.read()
        finally:
            # Release the connection even if read() fails.
            response.close()

    def start_h4(self, attrs):
        """Mark that the parser entered an ``<h4>`` element."""
        self.h4 = True

    def end_h4(self):
        """Leave the ``<h4>`` and allow a following content ``<div>``."""
        self.h4 = False
        self.h4_is_ready = True

    def start_a(self, attrs):
        """Track ``<a>`` elements, but only inside an ``<h4>`` or content div."""
        if self.h4 or self.div:
            self.a = True

    def end_a(self):
        """Mark that the parser left an ``<a>`` element."""
        self.a = False

    def start_div(self, attrs):
        """Handle ``<div>``: enter the content div or count a nested one.

        Divs are ignored until an ``<h4>`` has been closed.  While already
        inside the content div, each further ``<div>`` increases ``depth`` so
        that ``end_div`` can tell nested closes from the outer one.
        """
        if not self.h4_is_ready:
            return
        if self.div:
            self.depth += 1
        for key, value in attrs:
            if key == 'class' and value == 'content':
                self.div = True
                # The content div belongs to the most recently closed <h4>.
                self.h4_and_div = True

    def end_div(self):
        """Handle ``</div>``: unwind one nesting level or reset the state.

        A close at depth 0 ends the current entry: the div flags, the
        ``h4_is_ready`` flag and ``names`` are all reset.
        """
        if self.depth == 0:
            self.div = False
            self.h4_and_div = False
            self.h4_is_ready = False
            self.names = ""
        if self.div:
            # Still inside the content div, so this closed a nested <div>.
            self.depth -= 1

    def handle_data(self, text):
        """Route a chunk of text to ``names`` or to ``dic``.

        Args:
            text: character data reported by the parser.
        """
        if self.h4:
            if not self.a:
                # Plain text inside the <h4>: record it under the current name.
                if text:
                    self.dic.setdefault(self.names, []).append(text)
                return
            # Text of the <a> inside the <h4> extends the name.
            self.names += text
        if self.h4_and_div:
            self.dic.setdefault(self.names, []).append(text)

    def show(self):
        """Print every collected name followed by its joined text.

        Spaces are stripped from both parts.  The byte strings are decoded as
        UTF-8 and re-encoded for output; characters the target encoding cannot
        represent are replaced rather than aborting the listing.
        """
        # getfilesystemencoding() may return None on Python 2; fall back so
        # encode() is never handed None.
        encoding = sys.getfilesystemencoding() or 'utf-8'
        for name in self.dic:
            says = ''.join(self.dic[name])
            name_out = name.replace(' ', '').decode('utf-8').encode(encoding, 'replace')
            says_out = says.replace(' ', '').decode('utf-8').encode(encoding, 'replace')
            # Same bytes as ``print name_out, says_out`` under Python 2.
            print(name_out + ' ' + says_out)
# Example driver, executed at module level (so it also runs on import).
# Replace the two placeholder strings with real account credentials.
renrenspider=spider('your email','your password')
# Send the login POST; the response body is stored on renrenspider.file.
renrenspider.login()
# Feed the fetched page to the parser so its tag/data callbacks fill renrenspider.dic.
renrenspider.feed(renrenspider.file)
renrenspider.show()

關於 Python 怎麼爬取人人網新鮮事就分享到這裡了，希望以上內容可以對大家有一定的幫助，可以學到更多知識。如果覺得文章不錯，可以把它分享出去讓更多的人看到。
免責聲明：本站發布的內容（圖片、視頻和文字）以原創、轉載和分享為主，文章觀點不代表本網站立場，如果涉及侵權請聯繫站長郵箱：is@yisu.com 進行舉報，並提供相關證據，一經查實，將立刻刪除涉嫌侵權內容。