<!DOCTYPE html>
<!-- Demo page: images and hyperlinks (content is Simplified Chinese, hence lang). -->
<html lang="zh-Hans">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>图片和超链接</title>
</head>
<body>
  <!-- Plain image -->
  <img src="岳飞.jpg" alt="岳飞" width="200">
  <hr>
  <!-- Plain text hyperlink -->
  <a href="https://www.baidu.com/">百度</a>
  <hr>
  <!-- Image used as a hyperlink to the Baidu Baike entry for Yue Fei -->
  <a href="https://baike.baidu.com/item/%E5%B2%B3%E9%A3%9E/127844?fr=aladdin">
    <img src="岳飞.jpg" alt="岳飞" width="200">
  </a>
</body>
</html>
"""Scrape the Baidu homepage and download its logo image to logo.png.

Fetches https://www.baidu.com, locates the logo <img> element by its id
via XPath, resolves the image URL, and saves the bytes to ./logo.png.
"""
import requests
# Parse the fetched HTML
from lxml import etree
# Robust URL resolution (absolute, scheme-relative "//..." and relative paths)
from urllib.parse import urljoin

BASE_URL = 'https://www.baidu.com'

# Impersonate a desktop browser: Baidu serves a stripped-down page to
# unknown user agents, which may not contain the logo element at all.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}

# Issue the GET request; fail fast on HTTP errors and hung connections
# instead of silently parsing an error page.
r = requests.get(BASE_URL, headers=headers, timeout=10)
r.raise_for_status()
# Force UTF-8 decoding of the response body.
r.encoding = 'utf-8'

# Convert the HTML string into an lxml element tree for XPath queries.
selector = etree.HTML(r.text)

# Extract the logo's src attribute. Guard against a missing element so a
# page-layout change raises a clear error rather than an IndexError.
srcs = selector.xpath('//*[@id="s_lg_img"]/@src')
if not srcs:
    raise RuntimeError('Logo element #s_lg_img not found in the fetched page')

# The original code prepended 'https:' unconditionally, which breaks if the
# src is already an absolute or relative URL; urljoin handles every case.
logo_url = urljoin(BASE_URL, srcs[0])

# Download the image bytes and write them out.
response = requests.get(logo_url, headers=headers, timeout=10)
response.raise_for_status()
with open('logo.png', 'wb') as file:
    file.write(response.content)
"""Demonstrate common lxml XPath selection patterns on a local HTML file.

Loads sample.html and prints the results of several XPath queries:
selection by path, by position, by attribute value, and by text content.
"""
import requests  # kept from the original file; not used in this script
from lxml import etree

# Read the sample page. The encoding is specified explicitly so the result
# does not depend on the platform's default locale encoding (e.g. cp936 on
# Chinese Windows). NOTE(review): assumes the file is UTF-8 — confirm.
with open(r'D:\worksapce\python\day10\sample.html', 'r', encoding='utf-8') as file:
    page = file.read()

# Parse the HTML string into an element tree.
selector = etree.HTML(page)

# Select the <ul> nested under a <div>.
ul = selector.xpath('//div/ul')
print(ul)

# All <li> children of that <ul>.
lis = selector.xpath('//div/ul/li')
print(lis)

# Positional predicate: XPath indices start at 1, so [2] is the second <li>.
li2 = selector.xpath('//div/ul/li[2]')
print(li2)

# Attribute predicate: select <li> elements by their class value.
li3 = selector.xpath('//div/ul/li[@class="item-inactive"]')
print(li3)

# All <a> elements inside the list items.
links = selector.xpath('//div/ul/li/a')
print(links)

# Select an <a> by its exact href attribute value.
a4 = selector.xpath('//div/ul/li/a[@href="link4.html"]')
print(a4)

# Select any element whose text content is exactly "fourth item".
a4 = selector.xpath('//*[text()="fourth item"]')
print(a4)
|