"""Scrape the poems linked from the gushiwen "tangshi" index page into a JSON file.

The script fetches the index page, follows every poem link found there,
extracts a fixed set of fields from each detail page and finally writes
all collected records as one JSON array.
"""
import json
import logging
import os
from typing import Dict, List
from urllib.parse import urljoin

import requests
from lxml import etree

logger = logging.getLogger(__name__)

headers = {
    # Adjacent string literals are concatenated by the parser; the trailing
    # space keeps "Gecko)" and "Chrome/" separated in the final header value.
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/104.0.0.0 Safari/537.36'
}
domain = 'https://so.gushiwen.cn'

# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 15

# Accumulates one dict per scraped poem; filled by get_poem_detail().
final_result = []


def _fetch_tree(url: str):
    """Download *url* and return it parsed as an lxml HTML tree.

    Raises:
        requests.RequestException: on connection problems, timeouts or a
            non-2xx HTTP status.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly instead of parsing an error page into an empty record.
    response.raise_for_status()
    return etree.HTML(response.text)


def _poem_id(href: str) -> str:
    """Return the text between the first "_" and the following "." in *href*.

    Returns an empty string when *href* contains no underscore, so callers
    get an XPath that matches nothing rather than an IndexError.
    """
    parts = href.split("_")
    if len(parts) < 2:
        return ""
    return parts[1].split('.')[0]


def _join_without_label(texts, label: str) -> str:
    """Concatenate *texts*, dropping the first entry equal to *label*.

    ``list.remove`` mutates in place and returns ``None``, so the removal
    must happen on a copy before joining.
    """
    remaining = list(texts)
    if label in remaining:
        remaining.remove(label)
    return "".join(remaining).strip()


def get_detail_href(url: str) -> List[str]:
    """Collect the detail-page URL of every poem linked from the index page.

    Args:
        url: Address of the index page to scan.

    Returns:
        Absolute URLs, in document order.

    Raises:
        requests.RequestException: if the index page cannot be fetched.
    """
    tree = _fetch_tree(url)
    hrefs = tree.xpath('//*[@id="html"]/body/div[2]/div[1]/div[2]//span/a/@href')
    # urljoin prefixes the domain onto relative links and leaves links that
    # are already absolute untouched.
    return [urljoin(domain, href) for href in hrefs]


def get_poem_detail(href: str) -> Dict[str, str]:
    """Scrape one poem detail page and append the record to ``final_result``.

    Args:
        href: Absolute URL of the detail page.

    Returns:
        The record that was appended (also available via ``final_result``).

    Raises:
        requests.RequestException: if the detail page cannot be fetched.
    """
    # The id of the element holding the poem body is "contson" + page id.
    aspx_file_name = _poem_id(href)
    tree = _fetch_tree(href)

    result = {}

    title = tree.xpath('//*[@id="sonsyuanwen"]/div[1]/h1/text()')
    result["title"] = "".join(title).strip()

    author = tree.xpath('//*[@id="sonsyuanwen"]/div[1]/p/a/text()')
    result["author"] = "".join(author).strip()

    content = tree.xpath(f'//*[@id="contson{aspx_file_name}"]/text()')
    result["content"] = "\n".join(content).strip()

    # Not extracted yet; the key is emitted so the record shape stays fixed.
    result["author_info"] = ""

    comment = tree.xpath('//*[@id="html"]/body/div[2]/div[1]/div[3]/div[1]/p[2]/text()')
    result["comment"] = _join_without_label(comment, '注释')

    translation = tree.xpath('//*[@id="html"]/body/div[2]/div[1]/div[3]/div[1]/p[1]/text()')
    result["translation"] = _join_without_label(translation, '译文')

    # Placeholders, same reason as "author_info" above.
    result["appreciation"] = ""
    result["extention"] = ""
    result["reading"] = ""

    final_result.append(result)
    return result


def main():
    """Scrape every poem on the index page and write the records to disk."""
    url = 'https://so.gushiwen.cn/gushi/tangshi.aspx'
    for href in get_detail_href(url):
        # Visit each detail page; one failing page must not discard the
        # records already collected.
        try:
            get_poem_detail(href)
        except requests.RequestException as exc:
            logger.warning("skipping %s: %s", href, exc)

    # NOTE(review): this resolves to "tangshi/.json" (a dot-file with no base
    # name) -- kept unchanged so existing consumers still find the output;
    # confirm whether a name such as "tangshi/tangshi.json" was intended.
    filename = "tangshi/" + ".json"
    # open() does not create missing parent directories.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "w", encoding='utf-8') as file:
        json.dump(final_result, file, indent=2, ensure_ascii=False)


if __name__ == '__main__':
    main()