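"""Scrape SHL's product catalog into shl_products.csv.

Uses Selenium with headless Chrome to load the catalog page, scroll until all
product cards are present, and extract each assessment's name, URL, remote
testing support, adaptive/IRT support, duration, and test type.
"""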
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from urllib.parse import urljoin


def scrape_shl_products():
    # Configure Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Optional: run the browser in the background
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )

    # Set up the driver; webdriver_manager downloads a chromedriver matching the installed Chrome
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    base_url = "https://www.shl.com"
    catalog_url = "https://www.shl.com/solutions/products/product-catalog/"
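
    # NOTE: the catalog URL and the CSS selectors below (.product-card,
    # .product-card__title, .product-card__meta-*) reflect the page structure
    # at the time of writing; if SHL changes its markup, update them here.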
    try:
        print("Loading SHL product catalog...")
        driver.get(catalog_url)

        # Wait for products to load
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".product-card"))
        )
        # Scroll to load all products
        print("Scrolling to load all products...")
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # Give lazy-loaded content time to render
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
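
        # NOTE: the loop above assumes an infinite-scroll page and stops once
        # the document height stops growing; if the catalog uses pagination
        # instead, a loop over the page links would be needed here.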

        product_cards = driver.find_elements(By.CSS_SELECTOR, ".product-card")
        print(f"Found {len(product_cards)} products.")

        products = []
        for card in product_cards:
            try:
                product = {
                    'Assessment Name': 'Not found',
                    'URL': 'Not found',
                    'Remote Testing Support': 'No',
                    'Adaptive/IRT Support': 'No',
                    'Duration': 'Not specified',
                    'Test Type': 'Not specified'
                }
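                # The defaults above guarantee every row has all six columns,
                # even when a card is missing some of the fields.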

                # Name
                name_element = card.find_element(By.CSS_SELECTOR, ".product-card__title")
                product['Assessment Name'] = name_element.text

                # URL
                link_element = card.find_element(By.CSS_SELECTOR, "a[href]")
                product['URL'] = urljoin(base_url, link_element.get_attribute("href"))

                # Metadata
                meta_items = card.find_elements(By.CSS_SELECTOR, ".product-card__meta-item")
                for item in meta_items:
                    try:
                        label = item.find_element(By.CSS_SELECTOR, ".product-card__meta-label").text.lower()
                        value = item.find_element(By.CSS_SELECTOR, ".product-card__meta-value").text
                        if 'remote' in label:
                            product['Remote Testing Support'] = 'Yes' if 'yes' in value.lower() else 'No'
                        elif 'adaptive' in label or 'irt' in label:
                            product['Adaptive/IRT Support'] = 'Yes' if 'yes' in value.lower() else 'No'
                        elif 'duration' in label:
                            product['Duration'] = value
                        elif 'type' in label:
                            product['Test Type'] = value
                    except NoSuchElementException:
                        continue

                products.append(product)
            except Exception as e:
                print(f"Error processing a product card: {str(e)}")
                continue
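
        # Each card is wrapped in its own try/except so one malformed card
        # is logged and skipped rather than aborting the whole scrape.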

        # Save data
        df = pd.DataFrame(products)
        df.to_csv('shl_products.csv', index=False)
        print("Data saved to shl_products.csv")
        return df

    except TimeoutException:
        print("Timeout loading the page.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    finally:
        driver.quit()  # Always close the browser, even after an error
        print("Browser closed.")


if __name__ == "__main__":
    print("Starting SHL scraper...")
    df = scrape_shl_products()
    if df is not None and not df.empty:
        print("\nFirst 5 results:")
        print(df.head())
    else:
        print("No data scraped.")
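
# Dependencies (install before running):
#   pip install selenium webdriver-manager pandas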