使用selenium + lxml自动爬取engoo daily-reading articles并写入文件

engoo是一个学习英语的网站。通过阅读每日新闻、回答问题,可以提高你的英语语感,与人交流时对不同的话题也会有更多的见解。我每天都会阅读两篇文章,然后整理成markdown记录下来,但是日复一日地拷贝文章、复制问题真的非常麻烦。所以这里我决定使用脚本将它们爬取下来,这样我只要每天运行一次脚本,再将生成的文件内容拷贝到markdown里即可。
刚开始我准备使用requests + lxml直接爬取文章,但是我发现该网站使用了 Google Tag Manager,使用requests完全拿不到数据,所以最终改用selenium + lxml。

requirements.txt

lxml==4.8.0
selenium==4.1.0

导库

import datetime
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

代码

# Scrape today's Engoo daily-news articles with a real Chrome window and write
# them, together with their question lists, to daily-read.txt as Markdown.

# Article links scraped from the listing page are site-relative.
BASE_URL = "https://engoo.com"
# Seconds to wait after each navigation before reading the page source.
PAGE_LOAD_WAIT = 5
# Every question panel shares this XPath; only the panel number differs.
EXERCISE_ITEMS_XPATH = (
    '//*[@id="windowexercise-{}"]/div/div/div/div[3]/div/div/div/div[1]'
    '/div/div[2]/div/div/div/span/span/span/text()'
)


def _exercise_items(tree, panel):
    """Return the text of every item found in exercise panel number *panel*."""
    return tree.xpath(EXERCISE_ITEMS_XPATH.format(panel))


def _format_items(items):
    """Render each item as a Markdown bullet followed by an empty green answer slot."""
    return "".join(
        f"\n+ {item}   \n<font style='color:green'></font>\n" for item in items
    )


ch_option = Options()
# The site returns no data to a headless browser, so headless mode stays off.
# ch_option.add_argument("--headless")

driver = webdriver.Chrome(options=ch_option)
articles = []
try:
    driver.get(BASE_URL + "/app/daily-news")
    time.sleep(PAGE_LOAD_WAIT)
    tree = etree.HTML(driver.page_source)
    false_urls = tree.xpath('//*[@id="content"]/div[1]/div[1]/div[2]/div/div[2]/a/@href')
    if not false_urls:
        print("No article links found; the page may not have loaded or its layout changed.")

    for url in false_urls:
        true_url = BASE_URL + url
        driver.get(true_url)
        time.sleep(PAGE_LOAD_WAIT)
        tree = etree.HTML(driver.page_source)

        titles = tree.xpath(
            '//*[@id="windowexercise-2"]/div/div/div/div[3]/div/div[1]/div/div/span/span/span/text()'
        )
        if not titles:
            # Without a title the page did not render as expected; skip it
            # instead of aborting the whole run with an IndexError.
            print(f"Skipping {true_url}: article title not found.")
            continue
        title = titles[0]

        paragraphs = tree.xpath(
            '//*[@id="windowexercise-2"]/div/div/div/div[3]/div/p/span/span/span/text()'
        )
        contents = "".join(f"\n<p>{p}</p>" for p in paragraphs)

        session_names = tree.xpath(
            '//*[@id="windowexercise-3"]/div/div/div/div[1]/div[2]/div/span[1]/span/span/span/text()'
        )
        # A missing heading is treated like any non-"Questions" heading.
        session_name = session_names[0] if session_names else ""

        if session_name == "Questions":
            # Panel 3 is the "Questions" section, so the two discussion
            # sections are read from panels 4 and 5.
            questions = _format_items(_exercise_items(tree, 3))
            discussion = _format_items(_exercise_items(tree, 4))
            further_discussion = _format_items(_exercise_items(tree, 5))
        else:
            # Otherwise panel 3 is read as the discussion section and
            # panel 4 as further discussion; the question list stays empty.
            questions = ""
            discussion = _format_items(_exercise_items(tree, 3))
            further_discussion = _format_items(_exercise_items(tree, 4))

        # Number by articles actually collected so a skipped page leaves no gap.
        number = len(articles) + 1
        articles.append(
            f"## Article{number}\n\n**[{title}]({true_url})**\n{contents}\n\n"
            f"**Question**\n{questions}\n\n**Discussion**\n {discussion}\n\n"
            f"**Further discussion**\n{further_discussion}\n        "
        )
finally:
    # Always close the browser, even if scraping fails part-way through.
    driver.quit()

article = "".join(entry + "\n" for entry in articles)

with open('daily-read.txt', 'w', encoding='utf-8') as fp:
    print("正在写入...")
    fp.write(f'# {datetime.datetime.now().strftime("%d/%m/%Y")}\n' + article)

最后得到的txt文件符合Markdown格式,每天我只需要将txt中的文本复制到Typora中即可,简单快捷。