Rewriting the Lagou (拉勾) spider is an important step toward integrating and unifying our data. The detailed approach breaks down as follows:
Combining Scrapy with Selenium
Implementing the Scrapy spider
Keyword handling
Requesting the page
Parsing the page
Storing the data
Keyword handling: Lagou's city-specific listing URLs embed the city name in pinyin (for example https://www.lagou.com/nanjing-zhaopin/), so the Chinese city name passed to the spider is converted with pypinyin:

```python
from pypinyin import lazy_pinyin

pinyin = lazy_pinyin("南京")   # returns the list ['nan', 'jing']
print(pinyin[0])               # 'nan'
print(pinyin[1])               # 'jing'
print(pinyin[0] + pinyin[1])   # 'nanjing'
```

The spider itself drives a Chrome instance through Selenium: it builds the start URL from the converted city name, types the search keyword, clicks the search button, and parses the rendered page with lxml:

```python
from time import sleep

from scrapy import Spider
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options
from pypinyin import lazy_pinyin
from lxml import etree

from Tztalent.items import TztalentItem


class LagouproSpider(Spider):
    name = 'lagoupro'

    def __init__(self, table_name, keyword, site, webhook):
        super(LagouproSpider, self).__init__()
        # Configure Chrome so the automation is harder to detect
        options = Options()
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        self.driver = webdriver.Chrome(options=options)
        # Hide the navigator.webdriver flag before any page script runs
        self.driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """Object.defineProperty(navigator, 'webdriver', { get: () => undefined })"""
        })
        self.keyword = keyword
        self.webhook_url = webhook
        self.table_name = table_name
        # Convert the Chinese city name to pinyin, e.g. 南京 -> nanjing
        pinyin = lazy_pinyin(site)
        self.site = ''.join(pinyin)
        # Build the city-specific start URL
        self.start_urls = [f"https://www.lagou.com/{self.site}-zhaopin/"]

    def parse(self, response):
        try:
            # Type the search keyword into the search box
            self.driver.find_element_by_id("keyword").send_keys(self.keyword)
            # Move to the search button and click it
            submit = self.driver.find_element_by_id("submit")
            ActionChains(self.driver).move_to_element(submit).perform()
            sleep(2)
            ActionChains(self.driver).click(submit).perform()
            sleep(2)
            # Grab the rendered page source and parse it with lxml
            str_html = self.driver.page_source
            html = etree.HTML(str_html)
            # Extract the job list items
            job_list = html.xpath("//ul[@class='item_con_list']/li")
            for job in job_list:
                item = TztalentItem()
                # Job title
                item['title'] = job.xpath(".//h3/text()")[0]
                # Company name and link
                company_info = job.xpath(".//div[@class='company_name']/a")
                item['company_name'] = company_info[0].text
                item['company_url'] = company_info[0].get('href')
                # Job location
                location = job.xpath(".//span[@class='add']/em/text()")[0]
                item['site'] = location
                yield item
        except Exception as e:
            print(f"Error: {e}")
            print('No data found')
```

With this approach, the Chinese keyword and city name are converted to pinyin, the correct URL is generated, the rendered page content is fetched successfully, and the required fields are extracted. The resulting data is stored in the database for later template processing.
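The post does not show the item definition or the storage step, so here is a minimal sketch of both, assuming a SQLite backend and only the four fields the spider fills in; the real Tztalent project may use a different database and more fields:

```python
# Hypothetical sketch: item definition (items.py) and a SQLite pipeline
# (pipelines.py). Field names are taken from what the spider populates;
# the database choice and table layout are assumptions.
import sqlite3

import scrapy


class TztalentItem(scrapy.Item):
    title = scrapy.Field()
    company_name = scrapy.Field()
    company_url = scrapy.Field()
    site = scrapy.Field()


class TztalentPipeline:
    def open_spider(self, spider):
        # table_name is the argument the spider was started with
        self.table = getattr(spider, "table_name", "jobs")
        self.conn = sqlite3.connect("tztalent.db")
        self.conn.execute(
            f"CREATE TABLE IF NOT EXISTS {self.table} "
            "(title TEXT, company_name TEXT, company_url TEXT, site TEXT)"
        )

    def process_item(self, item, spider):
        self.conn.execute(
            f"INSERT INTO {self.table} (title, company_name, company_url, site) "
            "VALUES (?, ?, ?, ?)",
            (item["title"], item["company_name"], item["company_url"], item["site"]),
        )
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.conn.close()
```

The pipeline would be enabled through ITEM_PIPELINES in settings.py (for example 'Tztalent.pipelines.TztalentPipeline', path assumed).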
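Because the spider's __init__ takes table_name, keyword, site and webhook, it can be started from the command line with scrapy crawl lagoupro -a keyword=... -a site=... (Scrapy forwards -a arguments to the constructor), or programmatically. A sketch using CrawlerProcess follows; the module path Tztalent.spiders.lagoupro and the argument values are assumptions, not taken from the original post:

```python
# run_lagou.py -- hypothetical launcher for the spider above.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from Tztalent.spiders.lagoupro import LagouproSpider

process = CrawlerProcess(get_project_settings())
process.crawl(
    LagouproSpider,
    table_name="lagou_jobs",
    keyword="Python",
    site="南京",          # converted to 'nanjing' inside the spider
    webhook="https://example.com/webhook",
)
process.start()
```

One cleanup the original omits: the Chrome instance is never closed. Defining a closed(self, reason) method on the spider that calls self.driver.quit() avoids leaking a browser process on every run.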