Web Scraping, Description

itsjustmyemail · April 25, 2023, 9:09pm

Would like this to print the job description for each job but it seems like it doesn’t find it:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time

# Booting up chrome
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=chrome_options)

# Pointing to Google Search and navigating there
url = f'https://www.google.com/search?q=jobs+"director"+OR+"consultant"+OR+"analyst"+AND+"improvement"+OR+"change"+OR+"innovation"+OR+"power+platform"+OR+"implementation"+AND+Calgary'
driver.get(url)

# Find and click the "Jobs" Area
location_button = driver.find_element(By.ID, 'fMGJ3e')
location_button.click()

# Find all the list items
list_items = driver.find_elements(By.CSS_SELECTOR, 'ul > li')

hrefs = []

for item in list_items:
    job_name = item.text
    print(job_name + ":\n")
    item.click()

    # Wait for the page to load and check for another unordered list
    time.sleep(2)
    while True:
        raw_html = driver.find_element(By.XPATH, '/html/body/div[2]/div/div[2]/div[1]/div/div/div[3]/div[2]/div/div[1]/div/div/g-scrolling-carousel/div[1]/div')
        html_source = raw_html.get_attribute("innerHTML")
        soup = BeautifulSoup(html_source, "html.parser")
        next_ul = soup.find_next("ul")

        if next_ul is None:
            break

        # Scroll down to load the next unordered list
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

    # Extract links from the current unordered list
    links = soup.find_all("a", href=True)

    # Create a list of links for the current list item
    item_links = []
    for link in links:
        href = link.get("href")
        if href and (href.startswith("http") or href.startswith("https")) and "job" in href:
            item_links.append(href)
    
    # Prioritize links and print the first one that matches the criteria
    found_link = False
    for link in item_links:
        if "linkedin" in link:
            print(link)
            found_link = True
            break
        elif "indeed" in link:
            print(link)
            found_link = True
            break
        elif "ziprecruiter" in link:
            print(link)
            found_link = True
            break
        else:
            print(link)

    # Find the job description span element
    try:
        job_description_element = driver.find_element(By.XPATH, '//*[@id="gws-plugins-horizon-jobs__job_details_page"]/div/div[4]/span/text()')
        job_description_text = job_description_element.text
        print(job_description_text)
    except:
        print("Job description not found.")
            
    # Append all links for the current list item to the global list
    hrefs.extend(item_links)
    print("\n")

SMCoder775 · October 31, 2023, 2:23am

The code you provided on line 79 tries to locate the job description using the xpath '//*[@id="gws-plugins-horizon-jobs__job_details_page"]/div/div[4]/span/text()'. However, it seems that this xpath might not be correct or the job description element might not be present in the HTML structure.

To troubleshoot this issue, you can try inspecting the HTML of the job details page manually to identify the correct xpath or CSS selector for the job description element. Make sure to check if the desired job description is contained within a specific HTML tag or class that you can target.

Once you have identified the correct selector, update the code on line 79 with the appropriate xpath or selector to extract the job description text.

(written by ghostwriter btw
ghostwriter is a great resource, so don’t feel ashamed to use it