from lxml import etree

# etree.parse() uses the strict XML parser unless told otherwise, and that
# parser rejects ordinary HTML (unclosed <li>, void tags, bare "&", ...).
# Passing an explicit HTMLParser makes loading a local .html file tolerant.
tree = etree.parse("1.html", etree.HTMLParser())

# Absolute path from the document root: the href attribute of every <a>
# that sits directly under /html/body/ul/li.
result = tree.xpath("/html/body/ul/li/a/@href")
print(result)

# Relative (local) parsing: select the <li> elements first, then run a
# second XPath on each element; "./" anchors the path at that element.
result = tree.xpath("/html/body/ul/li")
for li in result:
    print(li.xpath("./a/@href"))

# [@class='xxx'] selects by attribute value; text() returns the text nodes.
result = tree.xpath("//div[@class='job']/text()")
print(result)
实战案例:
一、58二手房标题
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Print the text of every <h3> element on the 58.com Beijing second-hand housing page."""
from lxml import etree
import requests

if __name__ == "__main__":
    # Browser-like User-Agent sent with the request.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 '
            'Core/1.70.3884.400 QQBrowser/10.8.4560.400'
        )
    }
    url = "https://bj.58.com/ershoufang/"

    # requests has no default timeout, so without one a stalled connection
    # would block this script forever.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    page_text = response.text

    tree = etree.HTML(page_text)
    # Text nodes that are direct children of any <h3> in the document.
    titles = tree.xpath("//h3/text()")
    for title in titles:
        print(title)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Print the city names listed on the aqistudy.cn historical-data index page."""
from lxml import etree
import requests
import os  # NOTE(review): not used in the visible part of this script

if __name__ == "__main__":
    # Browser-like User-Agent sent with the request.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 '
            'Core/1.70.3884.400 QQBrowser/10.8.4560.400'
        )
    }
    url = "https://www.aqistudy.cn/historydata/"

    # requests has no default timeout, so without one a stalled connection
    # would block this script forever.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    page_text = response.text

    tree = etree.HTML(page_text)

    # All cities: every <li> anywhere below <div class="bottom">.
    all_city_list = tree.xpath('//div[@class="bottom"]//li')
    all_city_names = []
    for li in all_city_list:
        link_texts = li.xpath("./a/text()")
        # An <li> without a direct <a> text node yields an empty list;
        # skip it rather than crash on link_texts[0].
        if link_texts:
            all_city_names.append(link_texts[0])

    for city in all_city_names:
        print(city)
    # print(all_city_names)

    # Hot cities (disabled): same extraction against <div class="hot">.
    # hot_city_names = []
    # hot_city_list = tree.xpath('//div[@class="hot"]//li')
    # for li in hot_city_list:
    #     city_name = li.xpath("./a/text()")[0]
    #     hot_city_names.append(city_name)
    # print(hot_city_names)