# rag-chatbot / Web_Scraper.py
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from urllib.parse import urljoin

def scrape_shl_products():
    # Configure Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run without a visible browser window
    chrome_options.add_argument("--no-sandbox")  # Commonly required in containers (e.g. Spaces)
    chrome_options.add_argument("--disable-dev-shm-usage")  # Avoid /dev/shm exhaustion in containers
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )

    # Set up the driver with a matching ChromeDriver binary
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    base_url = "https://www.shl.com"
    catalog_url = "https://www.shl.com/solutions/products/product-catalog/"
    try:
        print("Loading SHL product catalog...")
        driver.get(catalog_url)

        # Wait for the first product card to appear
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".product-card"))
        )

        # Scroll until the page height stops growing, so all lazily
        # loaded products end up in the DOM
        print("Scrolling to load all products...")
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # Give lazy-loaded content time to render
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
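
        # Some catalog layouts paginate with a "load more" button instead of
        # (or in addition to) infinite scroll. A minimal sketch, assuming a
        # hypothetical ".product-catalog__load-more" selector: the loop is a
        # no-op if no such button exists, and is bounded to avoid spinning.
        for _ in range(20):
            try:
                load_more = driver.find_element(By.CSS_SELECTOR, ".product-catalog__load-more")
                if not load_more.is_displayed():
                    break
                driver.execute_script("arguments[0].click();", load_more)
                time.sleep(2)  # Let the next batch of cards render
            except NoSuchElementException:
                break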

        product_cards = driver.find_elements(By.CSS_SELECTOR, ".product-card")
        print(f"Found {len(product_cards)} products.")
        products = []
        for card in product_cards:
            try:
                # Defaults used when a field is missing from the card
                product = {
                    'Assessment Name': 'Not found',
                    'URL': 'Not found',
                    'Remote Testing Support': 'No',
                    'Adaptive/IRT Support': 'No',
                    'Duration': 'Not specified',
                    'Test Type': 'Not specified'
                }

                # Name
                name_element = card.find_element(By.CSS_SELECTOR, ".product-card__title")
                product['Assessment Name'] = name_element.text

                # URL (urljoin resolves relative hrefs against the site root)
                link_element = card.find_element(By.CSS_SELECTOR, "a[href]")
                product['URL'] = urljoin(base_url, link_element.get_attribute("href"))

                # Metadata: label/value pairs such as duration, test type,
                # and remote/adaptive support
                meta_items = card.find_elements(By.CSS_SELECTOR, ".product-card__meta-item")
                for item in meta_items:
                    try:
                        label = item.find_element(By.CSS_SELECTOR, ".product-card__meta-label").text.lower()
                        value = item.find_element(By.CSS_SELECTOR, ".product-card__meta-value").text
                        if 'remote' in label:
                            product['Remote Testing Support'] = 'Yes' if 'yes' in value.lower() else 'No'
                        elif 'adaptive' in label or 'irt' in label:
                            product['Adaptive/IRT Support'] = 'Yes' if 'yes' in value.lower() else 'No'
                        elif 'duration' in label:
                            product['Duration'] = value
                        elif 'type' in label:
                            product['Test Type'] = value
                    except NoSuchElementException:
                        continue

                products.append(product)
            except Exception as e:
                print(f"Error processing a product card: {e}")
                continue
        # Save data
        df = pd.DataFrame(products)
        df.to_csv('shl_products.csv', index=False)
        print("Data saved to shl_products.csv")
        return df

    except TimeoutException:
        print("Timeout loading the page.")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    finally:
        driver.quit()
        print("Browser closed.")

if __name__ == "__main__":
    print("Starting SHL scraper...")
    df = scrape_shl_products()
    if df is not None and not df.empty:
        print("\nFirst 5 results:")
        print(df.head())
    else:
        print("No data scraped.")