1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677 |
- import requests
- import os
- import json
- from lxml import etree
# HTTP headers sent with page requests; the user-agent mimics a desktop Chrome browser.
headers = {
    # The two literals are concatenated implicitly, so the first one must end
    # with a space to keep "Gecko)" and "Chrome/" as separate product tokens.
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/104.0.0.0 Safari/537.36'
}
# Site root used to turn the hrefs found on the index page into absolute URLs.
domain = 'https://so.gushiwen.cn'
# Accumulates one dict per scraped poem; filled by get_poem_detail().
final_result = []
def get_detail_href(url):
    """Collect the detail-page URL of every poem linked from an index page.

    Args:
        url: URL of the index page to download.

    Returns:
        A list of absolute URLs, one per matched ``<a href>``, in document
        order. The list is empty when the XPath matches nothing.
    """
    # Function-scope import so this block does not depend on a file-level change.
    from urllib.parse import urljoin

    # Use the same headers and timeout as the detail-page requests; without a
    # timeout a stalled connection would block the whole run indefinitely.
    res = requests.get(url, headers=headers, timeout=15)
    et = etree.HTML(res.text)
    hrefs = et.xpath('//*[@id="html"]/body/div[2]/div[1]/div[2]//span/a/@href')
    # urljoin yields the same result as `domain + href` for root-relative
    # hrefs, and additionally copes with hrefs that are already absolute or
    # that lack a leading slash.
    return [urljoin(domain, href) for href in hrefs]
def _join_without_label(parts, label):
    """Concatenate text nodes into one stripped string, skipping nodes equal to *label*."""
    return "".join(part for part in parts if part != label).strip()


def get_poem_detail(href):
    """Download one poem detail page, extract its fields and record them.

    The extracted record is appended to the module-level ``final_result``
    list and also returned.

    Args:
        href: Absolute URL of the detail page. The text between the first
            ``_`` and the following ``.`` is used as the page id to locate
            the content element (``contson<id>``).

    Returns:
        A dict with the keys ``title``, ``author``, ``content``,
        ``author_info``, ``comment``, ``translation``, ``appreciation``,
        ``extention`` and ``reading``. Fields that are not scraped are
        empty strings.

    Raises:
        IndexError: If *href* contains no ``_``.
    """
    aspx_file_name = href.split("_")[1].split('.')[0]
    response_text = requests.get(url=href, headers=headers, timeout=15).text
    tree = etree.HTML(response_text)

    title = tree.xpath('//*[@id="sonsyuanwen"]/div[1]/h1/text()')
    author = tree.xpath('//*[@id="sonsyuanwen"]/div[1]/p/a/text()')
    content = tree.xpath(f'//*[@id="contson{aspx_file_name}"]/text()')
    comment = tree.xpath('//*[@id="html"]/body/div[2]/div[1]/div[3]/div[1]/p[2]/text()')
    translation = tree.xpath('//*[@id="html"]/body/div[2]/div[1]/div[3]/div[1]/p[1]/text()')

    result = {
        "title": "".join(title).strip(),
        "author": "".join(author).strip(),
        # Each text node of the poem body becomes its own line.
        "content": "\n".join(content).strip(),
        "author_info": "",
        # The heading text nodes ('注释' / '译文') are dropped from the body.
        # list.remove() returns None and str has no append(), so the label
        # is filtered out instead of being removed in place.
        "comment": _join_without_label(comment, '注释'),
        "translation": _join_without_label(translation, '译文'),
        "appreciation": "",
        "extention": "",
        "reading": "",
    }
    final_result.append(result)
    return result
def main():
    """Scrape every poem linked from the Tang-poetry index page and save them as JSON.

    Each detail page is fetched in turn via get_poem_detail(), which
    accumulates the records in ``final_result``; the whole list is then
    written as UTF-8, indented JSON.
    """
    url = 'https://so.gushiwen.cn/gushi/tangshi.aspx'
    for href in get_detail_href(url):
        # Visit each detail page and collect its content.
        get_poem_detail(href)

    # NOTE(review): the base name is empty, so the output file is literally
    # "tangshi/.json" — confirm whether a name such as "tangshi.json" was intended.
    filename = "tangshi/"+".json"
    # open() does not create missing directories; make sure the target exists.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "w", encoding='utf-8') as file:
        json.dump(final_result, file, indent=2, ensure_ascii=False)


if __name__ == '__main__':
    main()
|