话不多说 直接上代码
注意:Python 版本为 Python 3.4+;lxml 是用来解析 html 内容的包,需要单独安装,安装命令为 pip install lxml,直接在系统 shell 中输入即可。代码中有较详细的注释,有问题的可以在文章下方评论区告诉我。
import os
import urllib
from urllib.parse import urljoin
from urllib.request import urlopen, Request

from lxml import etree
# Fetch the content of a web page
def getHtmlByUrl(url):
    """Download *url* and return its body as text.

    The request is sent with a desktop-browser User-Agent header.
    Exceptions raised by ``urlopen`` are not caught here and propagate
    to the caller.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body when the status code is 200; otherwise
        ``None``, after printing a failure message.
    """
    headers = {
        # Replace this with your own browser's UA string if you like
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
    }
    req = Request(url, headers=headers)
    # The context manager closes the response even if read/decode raises
    with urlopen(req) as res:
        if res.getcode() != 200:
            print("获取网站 %s 信息失败" % url)
            return None
        # Use the charset declared in Content-Type; fall back to UTF-8
        charset = res.headers.get_content_charset() or 'utf-8'
        return res.read().decode(charset)
# Get the file extension of an image URL
def getImgSuffix(imgUrl):
    """Return the file extension (including the leading dot) of *imgUrl*.

    Anything from the first '?', '#' or '!' onwards is discarded first, so
    query strings, fragments and '!'-style suffixes do not end up in the
    result. The extension is then looked up only in the last path segment,
    so dots in the host name or in parent directories are never mistaken
    for one.

    Args:
        imgUrl: Image address, absolute or relative.

    Returns:
        The extension such as ``'.jpg'``, or ``''`` when the last path
        segment contains no dot.
    """
    # Strip parameters / fragment / processing suffix
    for sep in ('?', '#', '!'):
        imgUrl = imgUrl.partition(sep)[0]
    # Only the final path segment can carry the file extension
    fileName = imgUrl.rsplit('/', 1)[-1]
    dot = fileName.rfind('.')
    if dot == -1:
        return ''
    return fileName[dot:]
# Change this to the page you want to download images from
pageUrl = 'http://www.itmtr.cn/'
htmlStr = getHtmlByUrl(pageUrl)
print("网页内容:", htmlStr, "\n")
# getHtmlByUrl returns None on failure; skip parsing in that case so the
# script reports zero images instead of crashing inside the parser.
html = etree.HTML(htmlStr) if htmlStr else None
# Collect the src attribute of every <img> tag
imgs = html.xpath('//img/@src') if html is not None else []
print('共有图片', len(imgs), '个,开始下载...')
# Download directory: "imgs" under the current working directory.
# os.path.join keeps the path valid on non-Windows systems as well.
basePath = os.path.join(os.getcwd(), 'imgs')
os.makedirs(basePath, exist_ok=True)
print('下载图片至:', basePath)
# Number of images saved so far; also used as the output file name
i = 0
for img in imgs:
    if not img:
        continue
    # Resolve protocol-relative ('//host/a.png') and relative ('/a.png')
    # links against the page address so they can be requested directly.
    img = urljoin(pageUrl, img)
    print('\n图片链接', img)
    try:
        with urlopen(img) as imgRes:
            if imgRes.getcode() != 200:
                continue
            data = imgRes.read()
        path = os.path.join(basePath, str(i) + getImgSuffix(img))
        print('图片路径', path)
        # Save the file; the handle is closed even if the write fails
        with open(path, 'wb') as fp:
            fp.write(data)
        i += 1
    except Exception as e:
        # Best effort: one failed image must not abort the remaining ones
        print('图片下载失败:', e)
print("\n图片下载完成,共 %d 个,下载成功 %d 个" % (len(imgs), i))