# poem2.py — scrapes Song-dynasty poems from gushiwen.cn into JSON
import json
import os

import requests
from lxml import etree
  5. headers = {
  6. 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
  7. 'Chrome/104.0.0.0 Safari/537.36'
  8. }
  9. domain = 'https://so.gushiwen.cn'
  10. final_result = []
  11. def get_detail_href(url):
  12. # 获取每首诗的href值
  13. res = requests.get(url)
  14. et = etree.HTML(res.text)
  15. hrefs = et.xpath('//*[@id="html"]/body/div[2]/div[1]/div[2]//span/a/@href')
  16. # 处理href,需要添加域名
  17. new_hrefs = []
  18. for href in hrefs:
  19. new_hrefs.append(domain + href)
  20. return new_hrefs
  21. def get_poem_detail(href):
  22. # 访问每一个详情页,拿到数据
  23. aspx_file_name = href.split("_")[1].split('.')[0]
  24. response_text = requests.get(url=href, headers=headers, timeout=15).text
  25. tree = etree.HTML(response_text)
  26. result = {}
  27. title = tree.xpath('//*[@id="sonsyuanwen"]/div[1]/h1/text()')
  28. result["title"] = "".join(title).strip()
  29. author = tree.xpath('//*[@id="sonsyuanwen"]/div[1]/p/a/text()')
  30. result["author"] = "".join(author).strip()
  31. content = tree.xpath(f'//*[@id="contson{aspx_file_name}"]/text()')
  32. content = "\n".join(content)
  33. result["content"] = "".join(content).strip()
  34. author_info = ''
  35. result["author_info"] = "".join(author_info).strip()
  36. comment = tree.xpath('/html/body/div[2]/div[1]/div[3]/div/p[2]/text()[2]')
  37. result["comment"] = "".join(comment).strip().lstrip('注释')
  38. translation = tree.xpath('/html/body/div[2]/div[1]/div[3]/div/p[1]/text()')
  39. result["translation"] = "".join(translation).strip().lstrip('译文')
  40. appreciation = ''
  41. result["appreciation"] = "".join(appreciation).strip()
  42. extention = ''
  43. result["extention"] = "".join(extention).strip()
  44. reading = ''
  45. result["reading"] = "".join(reading).strip()
  46. final_result.append(result)
  47. def main():
  48. url = 'https://so.gushiwen.cn/gushi/songsan.aspx'
  49. hrefs = get_detail_href(url)
  50. for href in hrefs:
  51. # 访问每一个详情页,得到每个详情页的内容
  52. get_poem_detail(href)
  53. filename = "songci/" + ".json"
  54. with open(filename, "w", encoding='utf-8') as file:
  55. file.write(json.dumps(final_result, indent=2, ensure_ascii=False))
  56. if __name__ == '__main__':
  57. main()
# XPath samples observed on detail pages (ids vary per poem):
# //*[@id="contsonf4c976914347"]
# //*[@id="contson169ff9fbafcb"]
# //*[@id="contsond7142f5463fa"]
# //*[@id="contsona17364b0318f"]
# //*[@id="fanyi1208"]/div/p[1]
# //*[@id="fanyi1208"]/div/p[2]/text()[2]
# //*[@id="fanyi1208"]/div/p[2]/text()[2]