Make sure there is an image folder on the D drive to store the scraped images.

import requests
from lxml import etree
import time


class Baiduspider(object):
    def __init__(self):
        self.baseurl = 'https://tieba.baidu.com/'
        self.url = 'https://tieba.baidu.com/f?'
        self.headers = {'User-Agent': 'Mozilla/5.0'}

    # Collect the post links on a forum list page
    def getPageUrl(self, url):
        res = requests.get(url, headers=self.headers)
        res.encoding = 'utf-8'
        html = res.text
        parseHtml = etree.HTML(html)
        # Call xpath on the parsed document
        r1 = parseHtml.xpath('//div[@class="t_con cleafix"]/div/div/div/a/@href')
        for t in r1:
            self.getImaUrl(t)

    # Collect the image links inside a post
    def getImaUrl(self, t):
        res = requests.get(self.baseurl + t, headers=self.headers)
        res.encoding = 'utf-8'
        html = res.text
        parseHtml = etree.HTML(html)
        # Two candidate XPath expressions; if the first one returns nothing, switch to the second
        r2 = parseHtml.xpath('//div[@class="d_post_content j_d_post_content clearfix"]/img[@class="BDE_Image"]/@src')
        # r2 = parseHtml.xpath('//div[@class="d_post_content j_d_post_content "]/img[@class="BDE_Image"]/@src')
        for i in r2:
            self.saveImage(i)

    # Save one image to the local disk
    def saveImage(self, i):
        print(i)
        res = requests.get(i, headers=self.headers)
        html = res.content
        # Customize the save path here; I created an image folder on the D drive,
        # and each file is named with time.time()
        with open(r'D:\image\%s.jpg' % time.time(), 'wb') as f:
            f.write(html)

    def workOn(self):
        key = input('Enter the Tieba forum name to crawl: ')
        pn = int(input('Enter the number of pages to crawl: '))
        for x in range(1, pn + 1):
            url = self.url + 'kw=' + key + '&pn=' + str((x - 1) * 50)
            print('================')
            print(url)
            self.getPageUrl(url)


if __name__ == '__main__':
    spider = Baiduspider()
    spider.workOn()
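workOn builds the list-page URL by concatenating kw and pn by hand. If you would rather not assemble the query string manually, the standard library's urllib.parse.urlencode does the escaping for you. A minimal sketch under that assumption (build_list_url is a hypothetical helper, not part of the script above):

from urllib.parse import urlencode

def build_list_url(base, key, page):
    # Tieba paginates in steps of 50 posts, matching (x - 1) * 50 above
    params = {'kw': key, 'pn': (page - 1) * 50}
    return base + urlencode(params)

# Example:
# build_list_url('https://tieba.baidu.com/f?', 'python', 2)
# -> 'https://tieba.baidu.com/f?kw=python&pn=50'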
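Also note that saveImage names files with time.time() and assumes D:\image already exists. A small hardened variant, assuming the same requests-based download (save_image, the pathlib directory creation, and the sequential counter are my own additions for illustration, not part of the original script):

from itertools import count
from pathlib import Path
import requests

_seq = count(1)

def save_image(url, headers, out_dir=r'D:\image'):
    # Create the target folder if it does not exist yet
    folder = Path(out_dir)
    folder.mkdir(parents=True, exist_ok=True)
    res = requests.get(url, headers=headers)
    # Number files sequentially so fast downloads never overwrite each other
    path = folder / '{:06d}.jpg'.format(next(_seq))
    path.write_bytes(res.content)
    return path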