<!DOCTYPE html>
<html lang="zh-CN">
<head>
  <meta charset="UTF-8">
  <title>文本处理</title>
</head>
<body>
  <!-- Headings -->
  <h1>侧开40班</h1>
  <h3>今天是个好天气!</h3>

  <!-- Paragraphs -->
  <p>我爱你中国自妮子的捎带的VS山东省规划的就是那块明明是经济纠纷内需重卡卡旺卡</p>
  <p>我爱你中国自妮明明是经济纠纷内需重卡卡旺卡</p>

  <!-- Lists -->
  <!-- Ordered list -->
  <ol>
    <li>美国</li>
  </ol>
  <!-- Unordered list -->
  <ul>
    <li>米</li>
    <li>面</li>
    <li>油</li>
  </ul>
</body>
</html>

<!DOCTYPE html>
<html lang="zh-CN">
<head>
  <meta charset="UTF-8">
  <title>分区元素</title>
  <style>
    div {
      border: 1px solid red;
    }

    p {
      color: red;
    }

    .abc {
      color: blue;
    }
  </style>
</head>
<body>
  <!-- Block-level elements: each one occupies its own line -->
  <div>马上吃饭了</div>
  <div>好饿啊</div>

  <!-- Inline elements: they flow within the same line -->
  <span>米饭</span>
  <span>鸡腿</span>
  <span>青菜</span>
  <p>今天是个<span class="abc">好</span>日子</p>
</body>
</html>

<!DOCTYPE html>
<html lang="zh-CN">
<head>
  <meta charset="UTF-8">
  <title>图片和超链接</title>
</head>
<body>
  <img src="岳飞.jpg" alt="岳飞" width="200">
  <hr>
  <!-- Hyperlinks -->
  <a href="https://www.baidu.com">百度</a>
  <hr>
  <!-- An image used as the clickable content of a link -->
  <a href="https://baike.baidu.com/item/%E5%B2%B3%E9%A3%9E/127844?fr=aladdin"><img src="岳飞.jpg" alt="岳飞" width="200"></a>
</body>
</html>

<!DOCTYPE html>
<html lang="zh-CN">
<head>
  <meta charset="UTF-8">
  <title>表格</title>
</head>
<body>
  <table border="1" cellspacing="0" cellpadding="0">
    <!-- Row -->
    <tr>
      <!-- Cells (columns) -->
      <td>aaa</td>
      <td>bbb</td>
      <td>ccc</td>
    </tr>
    <tr>
      <td>ddd</td>
      <td>eee</td>
      <td>fff</td>
    </tr>
    <tr>
      <!-- A single cell spanning all three columns -->
      <td colspan="3">ggg</td>
    </tr>
  </table>
</body>
</html>

<!DOCTYPE html>
<html lang="zh-CN">
<head>
  <meta charset="UTF-8">
  <title>表单</title>
</head>
<body>
  <!-- method="get" is the default and is spelled out here; the field values
       (including the password) therefore travel in the query string. -->
  <form action="https://www.baidu.com" method="get">
    <!-- Text input -->
    <label>账号:<input type="text" name="account"></label>
    <br><br>
    <!-- Password input -->
    <label>密码:<input type="password" name="password"></label>
    <br>
    <input type="submit" value="注册">
    <hr>
    <!-- Single choice: radios share one name; each needs its own value,
         otherwise every choice is submitted as "on". -->
    性别:<label><input type="radio" name="gender" value="male">男</label>
    <label><input type="radio" name="gender" value="female">女</label>
    <br><br>
    <!-- Multiple choice: distinct values let the receiver tell the boxes apart. -->
    爱好:<label><input type="checkbox" name="hobby" value="game">游戏</label>
    <label><input type="checkbox" name="hobby" value="basketball">篮球</label>
    <label><input type="checkbox" name="hobby" value="travel">旅游</label>
    <label><input type="checkbox" name="hobby" value="food">美食</label>
    <label><input type="checkbox" name="hobby" value="pingpong">乒乓</label>
    <br><br>
    <!-- Drop-down list: every option carries a unique value. -->
    <label>城市:
      <select name="city">
        <option value="0">请选择</option>
        <option value="1">西安</option>
        <option value="2">宝鸡</option>
        <option value="3">汉中</option>
      </select>
    </label>
  </form>
</body>
</html>

from urllib.parse import urljoin

import requests
from lxml import etree

# Present the crawler as a regular desktop browser.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
# Send a GET request and keep the response; the timeout prevents hanging forever.
r = requests.get('https://www.baidu.com/', headers=headers, timeout=10)
# Set the character encoding used to decode the response body.
r.encoding = 'utf-8'
# Parse the HTML text into an lxml element tree.
selector = etree.HTML(r.text)
# Look up the address of the logo image.
logo_sources = selector.xpath('//*[@id="s_lg_img"]/@src')
if not logo_sources:
    # Fail with a readable message instead of a bare IndexError.
    raise SystemExit('logo image (id="s_lg_img") was not found in the page')
href = logo_sources[0]
# Download the image. urljoin yields the same URL as 'https:' + href for a
# protocol-relative address and also copes with absolute or relative ones.
response = requests.get(urljoin('https://www.baidu.com/', href), timeout=10)
# Do not save an HTTP error page as if it were the image.
response.raise_for_status()
with open('logo.png', 'wb') as file:
    file.write(response.content)

import requests
from lxml import etree

# Raw string keeps the backslashes literal; '\d' and '\s' are invalid escape
# sequences in a normal string literal.
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm it matches the encoding of sample.html.
with open(r'python\day10\sample.html', 'r') as file:
    page = file.read()
selector = etree.HTML(page)

# Every <ul> that is a direct child of a <div>.
ul = selector.xpath('//div/ul')
print(ul)

# XPath positions start at 1 when several sibling elements share a name.
li2 = selector.xpath('//div/ul/li[2]')
print(li2)

# Select by attribute value.
li3 = selector.xpath('//div/ul/li[@class = "item-inactive"]')
print(li3)

li4 = selector.xpath('//div/ul/li/a[@href="link4.html"]')
print(li4)

# Select any element by its exact text content.
a4 = selector.xpath('//*[text()="fourth item"]')
print(a4)
|