# poem2.py — scrapes Song-dynasty poems from gushiwen.cn into JSON
import json
import os

import requests
from lxml import etree
  5. headers = {
  6. 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
  7. 'Chrome/104.0.0.0 Safari/537.36'
  8. }
  9. domain = 'https://so.gushiwen.cn'
  10. final_result = []
  11. def get_detail_href(url):
  12. # 获取每首诗的href值
  13. res = requests.get(url)
  14. et = etree.HTML(res.text)
  15. hrefs = et.xpath('//*[@id="html"]/body/div[2]/div[1]/div[2]//span/a/@href')
  16. # 处理href,需要添加域名
  17. new_hrefs = []
  18. for href in hrefs:
  19. new_hrefs.append(domain + href)
  20. return new_hrefs
  21. def get_poem_detail(href):
  22. # 访问每一个详情页,拿到数据
  23. aspx_file_name = href.split("_")[1].split('.')[0]
  24. response_text = requests.get(url=href, headers=headers, timeout=15).text
  25. tree = etree.HTML(response_text)
  26. result = {}
  27. title = tree.xpath('//*[@id="sonsyuanwen"]/div[1]/h1/text()')
  28. result["title"] = "".join(title).strip()
  29. author = tree.xpath('//*[@id="sonsyuanwen"]/div[1]/p/a/text()')
  30. result["author"] = "".join(author).strip()
  31. content = tree.xpath(f'//*[@id="contson{aspx_file_name}"]/text()')
  32. content = "\n".join(content)
  33. result["content"] = "".join(content).strip()
  34. author_info = ''
  35. result["author_info"] = "".join(author_info).strip()
  36. comment = tree.xpath('/html/body/div[2]/div[1]/div[3]/div/p[2]/text()[2]')
  37. result["comment"] = "".join(comment).strip().lstrip('注释')
  38. translation = tree.xpath('/html/body/div[2]/div[1]/div[3]/div/p[1]/text()')
  39. result["translation"] = "".join(translation).strip().lstrip('译文')
  40. appreciation = ''
  41. result["appreciation"] = "".join(appreciation).strip()
  42. extention = ''
  43. result["extention"] = "".join(extention).strip()
  44. reading = ''
  45. result["reading"] = "".join(reading).strip()
  46. final_result.append(result)
  47. def main():
  48. url = 'https://so.gushiwen.cn/gushi/songsan.aspx'
  49. hrefs = get_detail_href(url)
  50. for href in hrefs:
  51. # 访问每一个详情页,得到每个详情页的内容
  52. get_poem_detail(href)
  53. filename = "songci/" + ".json"
  54. with open(filename, "w", encoding='utf-8') as file:
  55. file.write(json.dumps(final_result, indent=2, ensure_ascii=False))
  56. if __name__ == '__main__':
  57. main()
# XPath samples observed on detail pages (ids vary per poem):
# //*[@id="contsonf4c976914347"]
# //*[@id="contson169ff9fbafcb"]
# //*[@id="contsond7142f5463fa"]
# //*[@id="contsona17364b0318f"]
# //*[@id="fanyi1208"]/div/p[1]
# //*[@id="fanyi1208"]/div/p[2]/text()[2]
# //*[@id="fanyi1208"]/div/p[2]/text()[2]