菁英科技(卓目鸟学苑)- 专注软件测试菁英教育
标题: python_赵梦冰_20210128 [打印本页]
作者: Loading... 时间: 2021-1-28 20:29
标题: python_赵梦冰_20210128
9:00——20:00
课堂笔记
import requests
from lxml import etree
# Script: fetch the Baidu home page, locate the logo image through its
# element id, and save the image to result.png in the working directory.
from urllib.parse import urljoin

# Disguise the crawler as a regular browser.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
# Send the GET request and receive the response. The timeout keeps the
# script from hanging indefinitely on a stalled connection.
r = requests.get('https://www.baidu.com', headers=headers, timeout=10)
# r = requests.get('https://www.douban.com',headers=headers)
# Stop on an HTTP error status instead of parsing an error page.
r.raise_for_status()
# Set the character encoding used to decode r.text
# (the notes originally had the misspelling 'uft-8').
r.encoding = 'utf-8'
# Print the page content (debugging aid).
# print(r.text)
# Convert the page text into an lxml element tree.
selector = etree.HTML(r.text)
# Get the image address from the element whose id is "s_lg_img".
srcs = selector.xpath('//*[@id="s_lg_img"]/@src')
if not srcs:
    # Same exception type as indexing an empty list, but with an explanation.
    raise IndexError('no element with id "s_lg_img" and a src attribute was found')
href = srcs[0]
print(href)
# Download the image. urljoin resolves the src against the page URL, so a
# protocol-relative ("//host/..."), root-relative or absolute src all work.
response = requests.get(urljoin(r.url, href), headers=headers, timeout=10)
response.raise_for_status()
# print(response.content)
# Save the image bytes.
with open('result.png', mode='wb') as picture:
    picture.write(response.content)
import requests
from lxml import etree
# Script: xpath practice on a local HTML file. Each query result is printed
# as a list of lxml elements.

# Earlier attempt at loading the file through requests, kept for reference:
# response = requests.get(r'D:\workspace\python\day10\sample.html')
# response.encoding = 'utf-8'
# print(response.text)

# Raw string so the backslashes in the Windows path are never read as escape
# sequences. The encoding is explicit so the read does not depend on the
# platform default; utf-8 matches the commented-out attempt above
# (NOTE(review): confirm sample.html is actually saved as UTF-8).
with open(r'D:\workspace\python\day10\sample.html', 'r', encoding='utf-8') as file:
    page = file.read()
selector = etree.HTML(page)
ul = selector.xpath('//div/ul')
print(ul)
# Get all li elements.
# lis = selector.xpath('//div/ul/li')
# Get the second li by indexing the Python list (0-based) ...
# li2 = selector.xpath('//div/ul/li')[1]
# ... or with an xpath positional predicate: same-named siblings are
# numbered starting from 1.
li2 = selector.xpath('//div/ul/li[2]')
print(li2)
# Pin down an element by attribute value.
li3 = selector.xpath('//div/ul/li[@class="item-inactive"]')
print(li3)
ass = selector.xpath('//div/ul/li/a')
print(ass)
# Locate a link by spelling out the element chain plus its href attribute.
a4 = selector.xpath('//div/ul/li/a[@href="link4.html"]')
print(a4)
# Locate an element of any tag purely by its text content.
a4 = selector.xpath('//*[text()="fourth item"]')
print(a4)
欢迎光临 菁英科技(卓目鸟学苑)- 专注软件测试菁英教育 (http://www.zmnxy.com/) |
Powered by Discuz! X3.4 |