A web crawler lets us pull specific elements out of a web page and make use of them.
BeautifulSoup is a Python module for parsing HTML; it makes it easy to locate HTML elements and work with their contents.
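For example, given a fragment of HTML, BeautifulSoup can pull out a single element by its id. A minimal sketch with made-up markup, just to show find() and get():

from bs4 import BeautifulSoup

html = '<form><input id="txtUserNo" value="user01"><span id="Label28">CASE-001</span></form>'
soup = BeautifulSoup(html, 'html.parser')
print(soup.find('input', {'id': 'txtUserNo'}).get('value'))  # prints: user01
print(soup.find('span', {'id': 'Label28'}).contents[0])      # prints: CASE-001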
Because the BPM system is built on ASP.NET, there are a few things to watch out for when analyzing its requests.
ASP.NET requests carry several special dynamic parameters whose values differ from page to page; each request must send the values that came back with the previous page, otherwise the server returns an error.
I used Chrome's developer tools to observe the requests and responses. An ASP.NET POST request carries the following special dynamic parameters:
__EVENTTARGET, __EVENTARGUMENT, __VIEWSTATE, __VIEWSTATEGENERATOR, __VIEWSTATEENCRYPTED, __EVENTVALIDATION
Among these, __VIEWSTATE and __EVENTVALIDATION must hold the values returned in the previous response. For example, with a flow of index.htm -> a.htm -> b.htm:
after following the link from index.htm to a.htm, the response for a.htm contains, besides the HTML itself, that page's __VIEWSTATE and __EVENTVALIDATION values,
and to move on to b.htm, the POST request issued from a.htm must include the __VIEWSTATE and __EVENTVALIDATION values that were returned with a.htm.
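In code the pattern looks roughly like this. This is a minimal sketch only: 'http://test-bpm/a.htm' is a placeholder URL and the real form fields are omitted; the full script below applies the same idea to the BPM pages:

import requests
from bs4 import BeautifulSoup

session = requests.Session()
# Step 1: GET the page; its HTML carries the hidden __VIEWSTATE / __EVENTVALIDATION fields
r = session.get('http://test-bpm/a.htm')
soup = BeautifulSoup(r.text, 'html.parser')
# Step 2: echo those values back in the next POST, together with the normal form fields
payload = {
    '__VIEWSTATE': soup.find('input', {'id': '__VIEWSTATE'}).get('value'),
    '__EVENTVALIDATION': soup.find('input', {'id': '__EVENTVALIDATION'}).get('value'),
    # ...plus __EVENTTARGET and the other form fields that trigger the postback
}
r = session.post('http://test-bpm/a.htm', data=payload)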
import requests
from bs4 import BeautifulSoup
# The HttpNtlmAuth module lets Python authenticate against a Windows domain
from requests_ntlm import HttpNtlmAuth
import re
from datetime import datetime,timedelta
import sys
# Parse an account application form
def Account_Apply(soup):
    txtUserNo = str.strip(soup.find('input',{"id":"txtUserNo"}).get("value"))      # requested account ID
    txtUserName = str.strip(soup.find('input',{"id":"txtUserName"}).get("value"))  # name of the new account
    txtPO = str(soup.find('input',{"id":"txtPO"}).get("value"))                    # form number
    PID = str(soup.find('span',{"id":"Label28"}).contents[0])                      # case number
    txtStartDate = str(soup.find('input',{"id":"txt_StarDate"}).get("value"))      # application date
    return (txtPO + " : " + PID + " : " + txtUserNo + " : " + txtUserName + " : " + txtStartDate)
#Post Data
def Apply_Postdata(post_dict):
    return {
        '__EVENTTARGET': post_dict['eventtarget'],
        '__EVENTARGUMENT': post_dict['eventargument'],
        '__VIEWSTATE': post_dict['viewstate'],
        '__VIEWSTATEGENERATOR': post_dict['viewstategenerator'],
        '__VIEWSTATEENCRYPTED': '',
        '__EVENTVALIDATION': post_dict['eventvalidation'],
        'ctl00$TopMenu1$cboLanguage': '1',
        'ctl00$ContentPlaceHolder1$cboProcessType': '0',   # process type: no restriction
        'ctl00$ContentPlaceHolder1$cboFlowStatus': 'N',    # status: no restriction
        'ctl00$ContentPlaceHolder1$cboFlowid': '8',        # account application form
        'ctl00$ContentPlaceHolder1$txtSdate': post_dict['Sdate'],
        'ctl00$ContentPlaceHolder1$txtEdate': post_dict['Edate'],
        'ctl00$ContentPlaceHolder1$txtisMercedesbenz': '0',   # hidden field, always 0
        'ctl00$ContentPlaceHolder1$txtStatus': post_dict['status'],  # hidden field
        'ctl00$ContentPlaceHolder1$txtmemo': '',
        'ctl00$ContentPlaceHolder1$txtActiveid': '',
        'ctl00$ContentPlaceHolder1$txtDisplayname': '',
        'ctl00$ContentPlaceHolder1$txt_ApplyName': ''
    }
#Configure
url_history = 'http://test-bpm/Client/Search/History.aspx'
url_index = 'http://test-bpm/Client/MyPage/index.aspx'
nt_user = ''
nt_passwd = ''
conSdate = ''
conEdate = ''
# Prompt for the Windows account, password, and date range
while nt_user is None or nt_user == '':
    nt_user = input("Account (Domain\\UserID): ")
while nt_passwd is None or nt_passwd == '':
    nt_passwd = input("Password: ")
while conSdate is None or conSdate == '':
    conSdate = input("Start date (yyyy/mm/dd): ")
while conEdate is None or conEdate == '':
    conEdate = input("End date (yyyy/mm/dd): ")
try:
    datetime.strptime(conSdate, '%Y/%m/%d')
    datetime.strptime(conEdate, '%Y/%m/%d')
except ValueError:
    print("\nInvalid date format")
    sys.exit(0)
# Set up a single HTTP session so that all the following steps reuse it
session = requests.Session()
session.auth = HttpNtlmAuth(nt_user, nt_passwd)
# Because ASP.NET POSTs depend on dynamic page variables, we have to start from the home page and
# walk to the target page step by step, using each page's variables as the next POST's parameters.
# index: start from the home page
r = session.get(url_index)
if r.status_code == 401:
    print('\nAuthentication failed')
    sys.exit(0)
elif r.status_code != 200:
    print('\nThe page returned an error code')
    sys.exit(0)
#history
r = session.get(url_history)
soup = BeautifulSoup(r.text, 'html.parser')
# Extract the dynamic parameter values from url_history
VIEWSTATE = soup.find("input",{"id":"__VIEWSTATE"}).get('value')
VIEWSTATEGENERATOR = soup.find("input",{"id":"__VIEWSTATEGENERATOR"}).get('value')
EVENTVALIDATION = soup.find("input",{"id":"__EVENTVALIDATION"}).get('value')
#History Post Data
# Note: the __VIEWSTATE and __EVENTVALIDATION values returned here must be passed as POST parameters for the next page
post_data = {
    'eventtarget': 'ctl00$ContentPlaceHolder1$lbtnQuery',
    'eventargument': '',
    'viewstate': VIEWSTATE,
    'viewstategenerator': VIEWSTATEGENERATOR,
    'eventvalidation': EVENTVALIDATION,
    'Sdate': conSdate,
    'Edate': conEdate,
    'status': 'P'
}
payload = Apply_Postdata(post_data)
# If the __VIEWSTATE and __EVENTVALIDATION values do not match, the server redirects to an error page,
# so checking the HTTP status code is unreliable: the redirect still succeeds and returns a 200 code.
r = session.post(url_history,data=payload)
# Parse the response with BeautifulSoup
soup = BeautifulSoup(r.text, 'html.parser')
history_url_link=[]
# findAll returns every matching element as a list
for x in soup.findAll('tr'):
    try:
        if re.match('location.href', x['onclick']):
            # Rewrite the relative URL into an absolute one
            history_url_link.append(x['onclick'].replace('location.href=\'..', 'http://test-bpm/Client').replace('\'', ''))
    except KeyError:
        pass  # rows without an onclick attribute are skipped
#History Pages
# Work out how many result pages there are besides the first one
# findAll can take a compiled regular expression from the re module as the search pattern
history_pages = len(soup.findAll(href=re.compile(r"Page\$")))
print("Account application result pages: " + str(history_pages + 1))
print("Form no. : Case no. : Requested account : Name : Application date")
# Record counter
show_count = 0
# First page
for s in range(len(history_url_link)):
    r = session.get(history_url_link[s])
    # Use a separate soup1 here instead of soup, because the next result page still needs soup's __VIEWSTATE and __EVENTVALIDATION
    soup1 = BeautifulSoup(r.text, 'html.parser')
    # find returns a single match while findAll returns a list; since this id is unique, find is sufficient
    href = 'http://test-bpm' + soup1.find("a", {"id": "ctl00_ContentPlaceHolder1_gvApprovalList_ctl02_hyForm"}).get('href')
    r = session.get(href)
    if r.status_code == 500:
        href = 'http://test-bpm' + soup1.find("a", {"id": "ctl00_ContentPlaceHolder1_gvApprovalList_ctl03_hyForm"}).get('href')
        r = session.get(href)
        if r.status_code == 500:
            print("Error: " + href)
    soup1 = BeautifulSoup(r.text, 'html.parser')
    apply_detail_data = Account_Apply(soup1)
    print(apply_detail_data)
    show_count = show_count + 1
# Remaining pages
if history_pages > 0:
    for pp in range(history_pages):
        VIEWSTATE = soup.find("input", {"id": "__VIEWSTATE"}).get('value')
        VIEWSTATEGENERATOR = soup.find("input", {"id": "__VIEWSTATEGENERATOR"}).get('value')
        EVENTVALIDATION = soup.find("input", {"id": "__EVENTVALIDATION"}).get('value')
        post_data = {
            'eventtarget': 'ctl00$ContentPlaceHolder1$gvQueryActivity',
            'eventargument': 'Page$' + str(pp + 2),  # pp starts at 0, so the first extra page is Page$2
            'viewstate': VIEWSTATE,
            'viewstategenerator': VIEWSTATEGENERATOR,
            'eventvalidation': EVENTVALIDATION,
            'Sdate': conSdate,
            'Edate': conEdate,
            'status': 'N'
        }
        payload = Apply_Postdata(post_data)
        r = session.post(url_history, data=payload)
        soup = BeautifulSoup(r.text, 'html.parser')
        history_url_link = []
        for x in soup.findAll('tr'):
            try:
                if re.match('location.href', x['onclick']):
                    # Rewrite the relative URL into an absolute one
                    history_url_link.append(x['onclick'].replace('location.href=\'..', 'http://test-bpm/Client').replace('\'', ''))
            except KeyError:
                pass  # rows without an onclick attribute are skipped
        for s in range(len(history_url_link)):
            r = session.get(history_url_link[s])
            soup1 = BeautifulSoup(r.text, 'html.parser')
            href = 'http://test-bpm' + soup1.find("a", {"id": "ctl00_ContentPlaceHolder1_gvApprovalList_ctl02_hyForm"}).get('href')
            r = session.get(href)
            if r.status_code == 500:
                href = 'http://test-bpm' + soup1.find("a", {"id": "ctl00_ContentPlaceHolder1_gvApprovalList_ctl03_hyForm"}).get('href')
                r = session.get(href)
                if r.status_code == 500:
                    print("Error: " + href)
            soup1 = BeautifulSoup(r.text, 'html.parser')
            apply_detail_data = Account_Apply(soup1)
            print(apply_detail_data)
            show_count = show_count + 1
print("抓取資料筆數: " + str(show_count))
# Write to a file (left commented out)
#f = open('testfile.csv','w')
#f.write(w_line)
#f.close()
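If the results should be saved instead of only printed, a minimal sketch could look like the following. Here results is a hypothetical list that is not in the script above; it would be filled by appending apply_detail_data next to each print call:

results = []  # hypothetical: results.append(apply_detail_data) would be called during the crawl
with open('testfile.csv', 'w', encoding='utf-8') as f:
    for w_line in results:
        f.write(w_line + '\n')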