平凡的幸福: Python BeautifulSoup模組爬蟲BPM帳號申請單程式

爬蟲程式可以讓我們取出網頁中指定的元素並加以利用

BeautifulSoup是Python的一個解析HTML的模組，方便我們找出HTML元素並加以利用。

因為BPM是以ASP.NET設計，所以在分析Request時有一些需要注意的地方

ASP.NET在傳送Request時，會有幾個特殊的動態參數，這些參數值會依不同的頁面而不同，所以該參數的內容必須是上一頁的值，否則會產生Error

我是以Chrome的開發人員工具來觀察Request與Response，ASP.NET在Request（POST模式）時，會有下列特殊的動態參數：

__EVENTTARGET、__EVENTARGUMENT、__VIEWSTATE、__VIEWSTATEGENERATOR、__VIEWSTATEENCRYPTED、__EVENTVALIDATION

其中__VIEWSTATE、__EVENTVALIDATION的值必須是上一頁Response傳回的值，例如由index.htm -> a.htm -> b.htm

由index.htm按下連結至a.htm後，a.htm的Response除了HTML之外，還會帶有__VIEWSTATE、__EVENTVALIDATION的值，

而如果要連結到b.htm時，由a.htm發出的POST Request就必須包含a.htm（也就是上一頁）Response所傳回的__VIEWSTATE、__EVENTVALIDATION的值

import getpass
import re
import sys
from datetime import datetime,timedelta

import requests
from bs4 import BeautifulSoup
# HttpNtlmAuth lets Python log in to a Windows domain
from requests_ntlm import HttpNtlmAuth

# Account-application form parsing
def Account_Apply(soup):
    """Summarise one account-application form page as a single line.

    Args:
        soup: parsed form page; only its ``find(name, attrs)`` method is
            used (a BeautifulSoup document in this script).

    Returns:
        ``"<order no> : <case no> : <account> : <name> : <start date>"``.
        A field whose element, ``value`` attribute or text is missing is
        rendered as an empty string instead of raising or printing "None".
    """
    def _input_value(element_id):
        # An <input> can be absent, or present without a value attribute;
        # both cases yield "".
        tag = soup.find('input', {"id": element_id})
        value = tag.get("value") if tag is not None else None
        return "" if value is None else str(value)

    txtUserNo = _input_value("txtUserNo").strip()      # requested account
    txtUserName = _input_value("txtUserName").strip()  # name of the new account's owner
    txtPO = _input_value("txtPO")                      # order number
    txtStartDate = _input_value("txt_StarDate")        # application date

    # The case number is the first child of the Label28 span.
    case_label = soup.find('span', {"id": "Label28"})
    if case_label is not None and case_label.contents:
        PID = str(case_label.contents[0])
    else:
        PID = ""

    return " : ".join((txtPO, PID, txtUserNo, txtUserName, txtStartDate))

# POST form body for History.aspx
def Apply_Postdata(post_dict):
    """Build the form body posted to the history search page.

    Args:
        post_dict: mapping with the keys ``eventtarget``, ``eventargument``,
            ``viewstate``, ``viewstategenerator``, ``eventvalidation``,
            ``Sdate``, ``Edate`` and ``status``.

    Returns:
        A dict of form-field name to value, in the order the fields are
        listed below.

    Raises:
        KeyError: if ``post_dict`` lacks one of the keys above.
    """
    content_prefix = 'ctl00$ContentPlaceHolder1$'

    # ASP.NET postback fields.
    postback_fields = [
        ('__EVENTTARGET', post_dict['eventtarget']),
        ('__EVENTARGUMENT', post_dict['eventargument']),
        ('__VIEWSTATE', post_dict['viewstate']),
        ('__VIEWSTATEGENERATOR', post_dict['viewstategenerator']),
        ('__VIEWSTATEENCRYPTED', ''),
        ('__EVENTVALIDATION', post_dict['eventvalidation']),
    ]

    # Search-form controls; every name shares the content-placeholder prefix.
    search_fields = [
        ('cboProcessType', '0'),               # process type: any
        ('cboFlowStatus', 'N'),                # flow status: any
        ('cboFlowid', '8'),                    # account-application form
        ('txtSdate', post_dict['Sdate']),
        ('txtEdate', post_dict['Edate']),
        ('txtisMercedesbenz', '0'),            # hidden field, always 0
        ('txtStatus', post_dict['status']),    # hidden field
        ('txtmemo', ''),
        ('txtActiveid', ''),
        ('txtDisplayname', ''),
        ('txt_ApplyName', ''),
    ]

    form = dict(postback_fields)
    form['ctl00$TopMenu1$cboLanguage'] = '1'
    for control, value in search_fields:
        form[content_prefix + control] = value
    return form

# Configuration: pages of the BPM site used by this script.
url_history = 'http://test-bpm/Client/Search/History.aspx'
url_index = 'http://test-bpm/Client/MyPage/index.aspx'

nt_user = ''
nt_passwd = ''
conSdate = ''
conEdate = ''

# Ask for the Windows account, password and query date range; each prompt
# repeats until something non-empty is entered.
while not nt_user:
    nt_user = input("帳號(Domain\\UserID): ")

while not nt_passwd:
    # getpass reads the password without echoing it to the terminal.
    nt_passwd = getpass.getpass("密碼(password): ")

while not conSdate:
    conSdate = input("日期起(yyyy/mm/dd): ")

while not conEdate:
    conEdate = input("日期迄(yyyy/mm/dd): ")

# Both dates must parse as yyyy/mm/dd. strptime raises ValueError on a bad
# value; only that is caught, and the script exits with a non-zero status so
# the failure is visible to the caller.
try:
    datetime.strptime(conSdate, '%Y/%m/%d')
    datetime.strptime(conEdate, '%Y/%m/%d')
except ValueError:
    print("\n日期格式錯誤")
    sys.exit(1)

# Create one HTTP session so that every following step reuses it.
session = requests.Session()
session.auth = HttpNtlmAuth(nt_user, nt_passwd)

# ASP.NET POSTs carry dynamic per-page fields, so the target page has to be
# reached step by step from the index page, using the previous page's
# fields as the POST parameters of the next request.

# index: start from the home page
r = session.get(url_index)

# Exit with a non-zero status on failure so callers can tell it from success.
if r.status_code == 401:
    print('\n認證錯誤')
    sys.exit(1)
elif r.status_code != 200:
    print('\n網頁傳回錯誤碼')
    sys.exit(1)

# History search: run the query on History.aspx, then open every case listed
# on each result page and print its account-application summary line.

_BPM_HOST = 'http://test-bpm'

# Ids of the form link on a case page, in the order they are tried: the
# second one is only used when the first cannot be opened.
_FORM_LINK_IDS = (
    "ctl00_ContentPlaceHolder1_gvApprovalList_ctl02_hyForm",
    "ctl00_ContentPlaceHolder1_gvApprovalList_ctl03_hyForm",
)


def _hidden_value(page, field_id):
    """Return the value of the <input id=field_id> on *page*.

    Prints a message and exits with status 1 when the input is missing
    (for example when the server answered with its error page).
    """
    tag = page.find("input", {"id": field_id})
    if tag is None:
        print("\n找不到頁面參數: " + field_id)
        sys.exit(1)
    return tag.get('value')


def _history_payload(page, target, argument, status):
    """Build the History.aspx POST body from the dynamic fields of *page*."""
    return Apply_Postdata({
        'eventtarget': target,
        'eventargument': argument,
        'viewstate': _hidden_value(page, "__VIEWSTATE"),
        'viewstategenerator': _hidden_value(page, "__VIEWSTATEGENERATOR"),
        'eventvalidation': _hidden_value(page, "__EVENTVALIDATION"),
        'Sdate': conSdate,
        'Edate': conEdate,
        'status': status,
    })


def _case_links(page):
    """Return the absolute case URLs found in the <tr onclick=...> rows of *page*."""
    links = []
    for row in page.find_all('tr'):
        onclick = row.get('onclick')
        # Rows without an onclick handler, or with another handler, are skipped.
        if onclick and onclick.startswith('location.href'):
            # Turn  location.href='../x'  into  http://test-bpm/Client/x
            links.append(
                onclick.replace("location.href='..", _BPM_HOST + '/Client')
                       .replace("'", '')
            )
    return links


def _print_case(case_url):
    """Open one case, follow its form link and print the summary line.

    Returns True when a line was printed. When no form link is found or
    every candidate answers HTTP 500, prints "Error: <url>" and returns False.
    """
    case_page = BeautifulSoup(session.get(case_url).text, 'html.parser')
    last_tried = case_url
    for link_id in _FORM_LINK_IDS:
        anchor = case_page.find("a", {"id": link_id})
        if anchor is None or not anchor.get('href'):
            continue
        last_tried = _BPM_HOST + anchor.get('href')
        form_response = session.get(last_tried)
        if form_response.status_code != 500:
            form_page = BeautifulSoup(form_response.text, 'html.parser')
            print(Account_Apply(form_page))
            return True
    print("Error: " + last_tried)
    return False


# history: load the search page to obtain its dynamic fields
r = session.get(url_history)
soup = BeautifulSoup(r.text, 'html.parser')

# The __VIEWSTATE / __EVENTVALIDATION values of the page just loaded are sent
# back as the POST parameters of the query.
payload = _history_payload(soup, 'ctl00$ContentPlaceHolder1$lbtnQuery', '', 'P')

# If those values do not match, the server redirects to an error page that
# is still delivered with HTTP 200, so the status code cannot be used to
# detect that situation.
r = session.post(url_history, data=payload)
soup = BeautifulSoup(r.text, 'html.parser')

history_url_link = _case_links(soup)

# Number of additional result pages: count the pager links whose href
# contains "Page$".
history_pages = len(soup.find_all(href=re.compile(r"Page\$")))

print("帳號申請單分頁數：" + str(history_pages + 1))
print("單號 : 案號 : 申請帳號 : 姓名 : 申請日期")

# Number of summary lines printed
show_count = 0

# First page
for case_url in history_url_link:
    if _print_case(case_url):
        show_count += 1

# Remaining pages: each request uses the dynamic fields of the result page
# fetched just before it (kept in `soup`).
# NOTE(review): the first query posts status 'P' while paging posts 'N';
# kept exactly as in the original script - confirm this is intended.
for page_no in range(2, history_pages + 2):
    payload = _history_payload(
        soup,
        'ctl00$ContentPlaceHolder1$gvQueryActivity',
        'Page$' + str(page_no),
        'N',
    )
    r = session.post(url_history, data=payload)
    soup = BeautifulSoup(r.text, 'html.parser')

    history_url_link = _case_links(soup)
    for case_url in history_url_link:
        if _print_case(case_url):
            show_count += 1

print("抓取資料筆數： " + str(show_count))

# 寫入檔案

#f = open('testfile.csv','w')

#f.write(w_line)

#f.close