1. Scraping a web page
# A simple web crawler
from urllib import request

import chardet

response = request.urlopen("http://www.jianshu.com/")
html = response.read()
# chardet.detect returns e.g. {'language': '', 'encoding': 'utf-8', 'confidence': 0.99}
charset = chardet.detect(html)
html = html.decode(str(charset["encoding"]))  # decode using the detected encoding
print(html)
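For comparison, here is a minimal sketch of the same fetch using the requests library (an assumption on my part; it is not part of the original example). requests exposes a chardet-style guess as apparent_encoding, so the manual detection step disappears:

import requests

response = requests.get("http://www.jianshu.com/")
response.encoding = response.apparent_encoding  # chardet-style encoding guess
print(response.text)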
2. Downloading the images on a page into a local folder with Python 3
import re
import urllib.request

# Fetch the page HTML
def getHtml(url):
    page = urllib.request.urlopen(url)
    html = page.read()
    return html

html = getHtml("http://tieba.baidu.com/p/3205263090")
html = html.decode('UTF-8')

# Extract the image URLs
def getImg(html):
    # Match the image addresses in the page with a regular expression
    reg = r'src="([^"]*\.jpg)" pic_ext="jpeg"'
    imgre = re.compile(reg)
    imglist = re.findall(imgre, html)
    return imglist

imgList = getImg(html)
imgCount = 0
# Download every matched image into the local pic folder;
# create the pic folder before running this loop
for imgPath in imgList:
    f = open("../pic/" + str(imgCount) + ".jpg", 'wb')
    f.write(urllib.request.urlopen(imgPath).read())
    f.close()
    imgCount += 1
print("All images downloaded")
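Matching raw HTML with regular expressions is brittle. As a sketch only, the same src attributes could be collected with BeautifulSoup (the library used in example 4 below); the pic_ext filter here mirrors the regular expression above and is an assumption about the page markup:

import urllib.request

from bs4 import BeautifulSoup

html = urllib.request.urlopen("http://tieba.baidu.com/p/3205263090").read()
soup = BeautifulSoup(html, 'html.parser')
# Keep only <img> tags that carry a pic_ext="jpeg" attribute
imglist = [img['src'] for img in soup.find_all('img', pic_ext='jpeg')]
print(imglist)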
3. Scraping girl photos
import os
import re
import time

import requests

# Impersonate a browser
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
}

# Target page
response = requests.get('https://www.vmgirls.com/13138.html', headers=headers)
html = response.text

# Extract the post title, used below as the folder name
dir_name = re.findall('<h1 class="post-title h3">(.*?)</h1>', html)[-1]

# Create the folder if it does not exist yet
if not os.path.exists(dir_name):
    os.mkdir(dir_name)

# Extract the image URLs; the target markup looks like:
# <a href="https://static.vmgirls.com/image/2019/12/2019122209234029-scaled.jpeg" alt="初恋粉色系" title="初恋粉色系">
urls = re.findall('<a href="(.*?)" alt=".*?" title=".*?">', html)
print(urls)

for url in urls:
    # Use the last path segment as the file name
    file_name = url.split('/')[-1]
    response = requests.get(url, headers=headers)
    # Save the image into the folder named after the post title
    with open(dir_name + '/' + file_name, 'wb') as f:
        f.write(response.content)
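The script above imports time without ever using it, which suggests a crawl delay was intended. Here is a sketch of the same download loop with a polite pause and a status check; it reuses urls, headers, and dir_name from the block above, and the one-second delay is an assumption, not from the source:

import time

import requests

# Reuses urls, headers, and dir_name defined in the script above
for url in urls:
    file_name = url.split('/')[-1]
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        continue  # skip broken links
    with open(dir_name + '/' + file_name, 'wb') as f:
        f.write(response.content)
    time.sleep(1)  # assumed one-second delay between downloads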
4. Scraping regional university lists
from urllib.request import urlopen  # fetch the page
from bs4 import BeautifulSoup       # parse the page

html = urlopen('http://www.hao123.com/edu')  # page that links to the university sites
bsObj = BeautifulSoup(html, 'html.parser')
t1 = bsObj.find_all('a')
for t2 in t1:
    t3 = t2.get('href')
    print(t3)

Then scrape the detailed school information behind the URLs collected above (saved one per line in 1.txt):

import re
import urllib.request

for line in open("1.txt", 'rt'):
    res = urllib.request.urlopen(line.strip())  # strip the trailing newline
    html = res.read().decode('gb2312')
    rule = re.findall(r"<p> <a.*?href=.*?</a></p></td>", html, re.I | re.S | re.M)
    for value in rule:
        with open('edu-urls.txt', 'a') as f:
            print(value.strip('<p> '), file=f)
print("it's ok")

The final school information ends up in edu-urls.txt, 2,865 records in total.
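As a small follow-up sketch, the output can be checked against that 2,865 figure (assuming edu-urls.txt was written by the loop above):

# Count and de-duplicate the collected records
with open('edu-urls.txt', 'rt') as f:
    records = [line.strip() for line in f if line.strip()]
print(len(records), "records,", len(set(records)), "unique")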