0. Workflow
1. Create a project with PyCharm Community Edition
2. Get the channel list
3. Get each channel's product links
4. Get each product's detailed info
1. Create a project
2. Get channel list
2.1 Check the CSS selector path
from bs4 import BeautifulSoup
import requests

start_url = 'http://jn.58.com/sale.shtml'

def get_channel_urls(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('ul.ym-submnu > li > b > a')
    print(links)

get_channel_urls(start_url)
2.2 Crawl channel links and save them as a list
from bs4 import BeautifulSoup
import requests

start_url = 'http://jn.58.com/sale.shtml'
url_host = 'http://jn.58.com'

def get_channel_urls(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('ul.ym-submnu > li > b > a')
    for link in links:
        # channel hrefs are relative, so prepend the site host
        page_link = url_host + link.get('href')
        print(page_link)

get_channel_urls(start_url)
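The version above only prints each channel URL. Since the goal of this step is to save them as a list, here is a minimal sketch (the channel_list name is my own) that collects the links and returns them instead:

def get_channel_urls(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('ul.ym-submnu > li > b > a')
    # collect the full channel URLs instead of printing them
    channel_list = [url_host + link.get('href') for link in links]
    return channel_list

channel_list = get_channel_urls(start_url)
print(channel_list)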
3. Parse the channel page and get each product's link
> parse
v.t. to describe or analyze (a word, sentence, etc.) grammatically
[Example] Parse files: files in this filter are parsed for autocomplete and other designers.
3.1 Check the CSS selector path
from bs4 import BeautifulSoup
import requests
import time
import pymongo

# client = pymongo.MongoClient('localhost', 27017)
# ceshi_58 = client['ceshi_58']
# product_url_list = ceshi_58['product_url_list']

# spider1
def get_product_links_from(channel, pages, who_sells=0):
    # e.g. http://jn.58.com/shouji/0/pn2/?islocal=1
    link_view = '{}{}/pn{}/?islocal=1'.format(channel, str(who_sells), str(pages))
    wb_data = requests.get(link_view)
    time.sleep(1)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    product_link = soup.select('td.t > a.t')
    print(product_link)

get_product_links_from('http://jn.58.com/shouji/', 2)
3.2 Get the products' links
from bs4 import BeautifulSoup
import requests
import time
import pymongo

# client = pymongo.MongoClient('localhost', 27017)
# ceshi_58 = client['ceshi_58']
# product_url_list = ceshi_58['product_url_list']

# spider1
def get_product_links_from(channel, pages, who_sells=0):
    # e.g. http://jn.58.com/shouji/0/pn2/?islocal=1
    link_view = '{}{}/pn{}/?islocal=1'.format(channel, str(who_sells), str(pages))
    wb_data = requests.get(link_view)
    time.sleep(1)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    product_link = soup.select('td.t > a.t')
    for link in product_link:
        # strip query parameters to keep only the clean item URL
        item_link = link.get('href').split('?')[0]
        print(item_link)

get_product_links_from('http://jn.58.com/shouji/', 2)
3.3 Save the products' links to MongoDB
Let's make some changes to the code so that it runs more efficiently.
3.3.1 We should let the code detect when a page does not exist and just pass, e.g. page 200.
Use soup.find() to check whether the page contains <div class="noinfotishi">.
BeautifulSoup's find() works a little differently from select(): find() takes the tag name and the class name as two comma-separated arguments, like find('div', 'noinfotishi'), whereas select() takes a CSS selector, like select('div.noinfotishi').
soup = BeautifulSoup(wb_data.text, 'lxml')
if soup.find('td', 't'):
    print('info')
else:
    print('noinfo')
or
soup = BeautifulSoup(wb_data.text, 'lxml')
if soup.find('div', 'noinfotishi'):
    pass
else:
    for link in soup.select('td.t a.t'):
        item_link = link.get('href').split('?')[0]
        product_url_list.insert_one({'url': item_link})
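For comparison, the same check can be written with select(), which returns a (possibly empty) list. This is just a sketch of the select('div.noinfotishi') form mentioned above:

soup = BeautifulSoup(wb_data.text, 'lxml')
# select() returns a list; an empty list means the "no info" notice is absent
if soup.select('div.noinfotishi'):
    pass
else:
    for link in soup.select('td.t a.t'):
        item_link = link.get('href').split('?')[0]
        product_url_list.insert_one({'url': item_link})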
3.3.2 We need to ignore the first few ads, whose links contain 'jump'.
for link in soup.select('td.t a.t'):
    item_link = link.get('href').split('?')[0]
    if 'jump' in item_link.split('/'):
        pass
    else:
        product_url_list.insert_one({'url': item_link})
3.3.3 Before running page_parsing.py, start the MongoDB service first.
mongod
3.3.4 page_parsing.py
from bs4 import BeautifulSoup
import requests
import time
import pymongo

client = pymongo.MongoClient('localhost', 27017)
ceshi_58 = client['ceshi_58']
product_url_list = ceshi_58['product_url_list']

# spider1
def get_product_links_from(channel, pages, who_sells=0):
    # e.g. http://jn.58.com/shouji/0/pn2/?islocal=1
    link_view = '{}{}/pn{}/?islocal=1'.format(channel, str(who_sells), str(pages))
    wb_data = requests.get(link_view)
    time.sleep(1)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    if soup.find('div', 'noinfotishi'):
        # no listings on this page, skip it
        pass
    else:
        for link in soup.select('td.t a.t'):
            item_link = link.get('href').split('?')[0]
            if 'jump' in item_link.split('/'):
                # skip advertisement links that redirect through a 'jump' URL
                pass
            else:
                product_url_list.insert_one({'url': item_link})

get_product_links_from('http://jn.58.com/shouji/', 2)
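page_parsing.py above crawls a single channel and page. The workflow in section 0 calls for every channel and many pages; a minimal driver loop might look like the sketch below, assuming the list-returning get_channel_urls() from section 2.2 and an arbitrary limit of 100 pages per channel:

# sketch of a driver loop; the 100-page limit is an arbitrary assumption
for channel in get_channel_urls(start_url):
    for page in range(1, 101):
        get_product_links_from(channel, page)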
4. Save item info to MongoDB
4.1 If the item is sold out, ignore it, because there is no info on that page.
e.g. http://zhuanzhuan.58.com/detail/938627420277926918z.shtml
4.2 Get the info from the tags
link = soup.select('ul.ym-submnu > li > b > a')[0].get('href')
price = soup.select('span.price_now > i')[0].text
area = list(soup.select('div.palce_li > span > i')[0].stripped_strings)
4.3 If the tag is on the page, get it; otherwise ignore it
area = list(soup.select('div.palce_li > span > i')[0].stripped_strings) if soup.find_all('div', 'palce_li') else None
from bs4 import BeautifulSoup
import requests
import pymongo

client = pymongo.MongoClient('localhost', 27017)
ceshi_58 = client['ceshi_58']
item_info = ceshi_58['item_info']  # collection name assumed; the original snippet uses item_info without defining it

def get_item_info(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # sold-out pages show a span.soldout_btn button, so skip them
    no_longer_exist = soup.select('span.soldout_btn')
    if no_longer_exist:
        pass
    else:
        title = soup.select('h1')[0].text
        price = soup.select('span.price_now > i')[0].text
        area = list(soup.select('div.palce_li > span > i')[0].stripped_strings) if soup.find_all('div', 'palce_li') else None
        item_info.insert_one({'title': title, 'price': price, 'area': area})

get_item_info('http://zhuanzhuan.58.com/detail/948479863277731847z.shtml')
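To finish workflow step 4, a second spider would read every URL that spider1 saved and feed it to get_item_info(). This is only a sketch, assuming the product_url_list collection from page_parsing.py:

# spider2 (sketch): walk the URLs stored by spider1 and scrape each item page
product_url_list = ceshi_58['product_url_list']
for row in product_url_list.find():
    get_item_info(row['url'])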