<!DOCTYPE html>
<!-- Demo page: images and hyperlinks (content is Simplified Chinese, hence lang). -->
<html lang="zh-Hans">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>图片和超链接</title>
</head>
<body>
  <!-- Plain image -->
  <img src="岳飞.jpg" alt="岳飞" width="200">
  <hr>
  <!-- Plain text hyperlink -->
  <a href="https://www.baidu.com/">百度</a>
  <hr>
  <!-- Image used as a hyperlink to the Baidu Baike entry for Yue Fei -->
  <a href="https://baike.baidu.com/item/%E5%B2%B3%E9%A3%9E/127844?fr=aladdin">
    <img src="岳飞.jpg" alt="岳飞" width="200">
  </a>
</body>
</html>
"""Scrape the Baidu homepage and download its logo image to logo.png.

Fetches https://www.baidu.com, locates the logo <img> element by its id
via XPath, resolves the image URL, and saves the bytes to ./logo.png.
"""
import requests
# Parse the fetched HTML
from lxml import etree
# Robust URL resolution (absolute, scheme-relative "//..." and relative paths)
from urllib.parse import urljoin

BASE_URL = 'https://www.baidu.com'

# Impersonate a desktop browser: Baidu serves a stripped-down page to
# unknown user agents, which may not contain the logo element at all.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}

# Issue the GET request; fail fast on HTTP errors and hung connections
# instead of silently parsing an error page.
r = requests.get(BASE_URL, headers=headers, timeout=10)
r.raise_for_status()
# Force UTF-8 decoding of the response body.
r.encoding = 'utf-8'

# Convert the HTML string into an lxml element tree for XPath queries.
selector = etree.HTML(r.text)

# Extract the logo's src attribute. Guard against a missing element so a
# page-layout change raises a clear error rather than an IndexError.
srcs = selector.xpath('//*[@id="s_lg_img"]/@src')
if not srcs:
    raise RuntimeError('Logo element #s_lg_img not found in the fetched page')

# The original code prepended 'https:' unconditionally, which breaks if the
# src is already an absolute or relative URL; urljoin handles every case.
logo_url = urljoin(BASE_URL, srcs[0])

# Download the image bytes and write them out.
response = requests.get(logo_url, headers=headers, timeout=10)
response.raise_for_status()
with open('logo.png', 'wb') as file:
    file.write(response.content)
"""Demonstrate common lxml XPath selection patterns on a local HTML file.

Loads sample.html and prints the results of several XPath queries:
selection by path, by position, by attribute value, and by text content.
"""
import requests  # kept from the original file; not used in this script
from lxml import etree

# Read the sample page. The encoding is specified explicitly so the result
# does not depend on the platform's default locale encoding (e.g. cp936 on
# Chinese Windows). NOTE(review): assumes the file is UTF-8 — confirm.
with open(r'D:\worksapce\python\day10\sample.html', 'r', encoding='utf-8') as file:
    page = file.read()

# Parse the HTML string into an element tree.
selector = etree.HTML(page)

# Select the <ul> nested under a <div>.
ul = selector.xpath('//div/ul')
print(ul)

# All <li> children of that <ul>.
lis = selector.xpath('//div/ul/li')
print(lis)

# Positional predicate: XPath indices start at 1, so [2] is the second <li>.
li2 = selector.xpath('//div/ul/li[2]')
print(li2)

# Attribute predicate: select <li> elements by their class value.
li3 = selector.xpath('//div/ul/li[@class="item-inactive"]')
print(li3)

# All <a> elements inside the list items.
links = selector.xpath('//div/ul/li/a')
print(links)

# Select an <a> by its exact href attribute value.
a4 = selector.xpath('//div/ul/li/a[@href="link4.html"]')
print(a4)

# Select any element whose text content is exactly "fourth item".
a4 = selector.xpath('//*[text()="fourth item"]')
print(a4)
|