Web Scraping With Python

Building a Product Recommendation System for E-Commerce: Part I — Web Scraping

How I extracted data using web scraping with Python during my Data Science Internship at ScoreData

Image for post
Image for post
Image by noshad ahmed from Pixabay

Data Collection

Image for post
Image for post
Screenshot of Inspect (by Author)
# Collect the product-page URL of every product tile on each listing page
# in url_list. Results accumulate in link_list.
link_list = []
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome('/usr/local/bin/chromedriver', options=options)
try:
    for url in url_list:
        driver.get(url)

        # Scroll down in steps, pausing after each one, before taking the
        # DOM snapshot that gets parsed.
        scroll_to = 0
        for _ in range(5):
            scroll_to += 500
            driver.execute_script('window.scrollTo(0, ' + str(scroll_to) + ');')
            time.sleep(1.5)

        # Parse the DOM as it stands *after* scrolling. The earlier version
        # parsed driver.page_source captured before the scroll and never used
        # this snapshot.
        innerHTML = driver.execute_script("return document.body.innerHTML")
        soup = BeautifulSoup(innerHTML, 'lxml')

        atag = soup.find_all('div', attrs={'class': 'ui-pl-visible-content'})
        for link in atag:
            anchor = link.find('a')
            # Skip tiles that carry no link instead of crashing on None.
            if anchor is None or not anchor.get('href'):
                continue
            product_url = 'https://www.backcountry.com' + anchor['href']
            link_list.append(product_url)
finally:
    # Always end the browser session, even if a page fails to load or parse.
    driver.quit()
def _text_or_none(tag):
    """Return the text of a BeautifulSoup tag, or None when the tag is None."""
    return None if tag is None else tag.text


def _parse_review_count(text):
    """Return the leading integer of a review-count label, or None if absent."""
    tokens = text.split()
    if not tokens:
        return None
    try:
        # Tolerate thousands separators such as "1,234".
        return int(tokens[0].replace(',', ''))
    except ValueError:
        return None


def product_information(url):
    """Scrape a single product page and return its fields as a dict.

    The page is loaded in headless Chrome, scrolled down in steps, and the
    resulting DOM is parsed with BeautifulSoup.

    Args:
        url: Address of the product page to load.

    Returns:
        A dict with these keys:
          - 'product_name': text of the title element, or None.
          - 'price': text of the retail price element, falling back to the
            sale price element, or None if neither is present.
          - 'product_description': text of the description element, or None.
          - 'product_details': list of stripped strings from the details
            list, or None when the page has no such list.
          - 'tech_spec': dict mapping tech-spec name text to value text
            (empty when the page has no tech-spec rows).
          - 'review_count': int parsed from the review-count label, or None.
    """
    product = {}
    options = Options()
    options.add_argument("--headless")
    options.headless = True
    options.add_argument("--window-size=1920,1200")

    driver = webdriver.Chrome('/usr/local/bin/chromedriver', options=options)
    try:
        driver.get(url)

        # Scroll down to where the review count is located on the page.
        scroll_to = 0
        for _ in range(5):
            scroll_to += 300
            driver.execute_script('window.scrollTo(0, ' + str(scroll_to) + ');')
            time.sleep(1.5)

        # Read the live DOM rather than the initial page source, since the
        # page content is rendered by script.
        innerHTML = driver.execute_script("return document.body.innerHTML")
    finally:
        # End the browser session even when loading or scrolling fails.
        driver.quit()

    soup = BeautifulSoup(innerHTML, 'lxml')

    # Product name.
    product['product_name'] = _text_or_none(
        soup.find('h1', {'class': 'product-name qa-product-title'})
    )

    # Price: prefer the retail price element, fall back to the sale price.
    price = soup.find('span', {'class': 'product-pricing__retail'})
    if price is None:
        price = soup.find('span', {'class': 'product-pricing__sale'})
    product['price'] = _text_or_none(price)

    # Product description (not every product has one).
    product['product_description'] = _text_or_none(
        soup.find('div', {'class': 'ui-product-details__description'})
    )

    # Product details list. find() returns None when the list is missing, so
    # guard before reading stripped_strings.
    product_details = soup.find('ul', {'class': 'prod-details-accordion__list'})
    if product_details is None:
        product['product_details'] = None
    else:
        product['product_details'] = list(product_details.stripped_strings)

    # Tech specs. find_all() returns an empty result rather than None, so the
    # dict is simply left empty when there are no rows.
    tech_spec = {}
    for row in soup.find_all('div', {'class': 'ui-product-details__techspec-row'}):
        tech_name = row.find('dt', {'class': 'ui-product-details__techspec-name'})
        tech_value = row.find('dd', {'class': 'ui-product-details__techspec-value'})
        # Skip rows that lack either the name or the value cell.
        if tech_name is None or tech_value is None:
            continue
        tech_spec[tech_name.text] = tech_value.text
    product['tech_spec'] = tech_spec

    # Review count: the label starts with the number.
    review_count = soup.find('span', {'class': 'review-count'})
    if review_count is None:
        product['review_count'] = None
    else:
        product['review_count'] = _parse_review_count(review_count.text)

    return product

Create a DataFrame

# Build one list per output column, each ordered like link_list.
# Every field except the brand lives under the record's 'description' entry.
descriptions = [product_dict[link]['description'] for link in link_list]

product_name = [entry['product_name'] for entry in descriptions]
brand_name = [product_dict[link]['brand_name'] for link in link_list]
price = [entry['price'] for entry in descriptions]
product_description = [entry['product_description'] for entry in descriptions]
product_details = [entry['product_details'] for entry in descriptions]
tech_spec = [entry['tech_spec'] for entry in descriptions]
review_count = [entry['review_count'] for entry in descriptions]

# Every distinct tech-spec name that appears on at least one product.
key_list = list({name for spec in tech_spec for name in spec.keys()})

# One column per tech-spec name, one row per product; a product that lacks
# a given spec gets None in that column.
key_dictionary = defaultdict(list)
for name in key_list:
    key_dictionary[name] = [spec.get(name) for spec in tech_spec]

tech = pd.DataFrame.from_dict(key_dictionary)

Conclusion

Written by

I’m passionate about the possibilities that Data Science can enable. I write about what I’ve learned. Never stop learning because life never stops teaching.❤️

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store