本帖最后由 Loading... 于 2021-1-28 20:34 编辑
9:00——20:00
# Class notes: two small web-scraping exercises using requests and lxml.
#   1. Fetch the Baidu home page and download its logo image.
#   2. Practise XPath queries against a local sample HTML file.

from urllib.parse import urljoin

import requests
from lxml import etree

# ---------------------------------------------------------------------------
# Exercise 1: download the logo image from the Baidu home page.
# ---------------------------------------------------------------------------

PAGE_URL = 'https://www.baidu.com'

# Seconds to wait for the server before giving up; without a timeout a
# request can block indefinitely.
REQUEST_TIMEOUT = 10

# Disguise the crawler as a regular browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/87.0.4280.88 Safari/537.36'
    )
}

# Send a GET request and receive the response.
r = requests.get(PAGE_URL, headers=headers, timeout=REQUEST_TIMEOUT)
# r = requests.get('https://www.douban.com', headers=headers)
# Fail loudly on an HTTP error instead of parsing an error page.
r.raise_for_status()

# Set the character encoding used to decode r.text.
r.encoding = 'utf-8'

# Print the page content (uncomment to inspect it).
# print(r.text)

# Parse the page text into an lxml element tree.
selector = etree.HTML(r.text)

# Get the image address. xpath() returns a list of matches, so [0] raises
# IndexError when the page contains no element with id "s_lg_img".
href = selector.xpath('//*[@id="s_lg_img"]/@src')[0]
print(href)

# Download the image. The src attribute is expected to be protocol-relative
# ("//host/path"); urljoin resolves that form, as well as absolute or
# relative URLs, against the page URL.
response = requests.get(
    urljoin(PAGE_URL, href),
    headers=headers,
    timeout=REQUEST_TIMEOUT,
)
# Do not save an HTTP error body as if it were the image.
response.raise_for_status()
# print(response.content)

# Save the image bytes.
with open('result.png', mode='wb') as picture:
    picture.write(response.content)

# ---------------------------------------------------------------------------
# Exercise 2: XPath queries against a local sample HTML file.
# ---------------------------------------------------------------------------

# The page is a local file, so it is read from disk rather than fetched
# with requests. The raw string keeps the Windows backslashes literal.
# NOTE(review): assumes sample.html is saved as UTF-8 -- confirm.
with open(r'D:\workspace\python\day10\sample.html', 'r', encoding='utf-8') as file:
    page = file.read()

selector = etree.HTML(page)

ul = selector.xpath('//div/ul')
print(ul)

# Get all li elements.
# lis = selector.xpath('//div/ul/li')

# Get the second li, either by indexing the Python result list (0-based),
# which yields a single element:
# li2 = selector.xpath('//div/ul/li')[1]
# or with an XPath position predicate, which yields a list. XPath numbers
# same-named sibling elements starting from 1.
li2 = selector.xpath('//div/ul/li[2]')
print(li2)

# Precise matching on an attribute value.
li3 = selector.xpath('//div/ul/li[@class="item-inactive"]')
print(li3)

# All a elements that sit directly inside the list items.
ass = selector.xpath('//div/ul/li/a')
print(ass)

# Locate a link by spelling out the full div/ul/li/a path plus its href.
a4 = selector.xpath('//div/ul/li/a[@href="link4.html"]')
print(a4)

# Locate an element by its text, without spelling out the path.
a4 = selector.xpath('//*[text()="fourth item"]')
print(a4)