Article Outline
Python mysql example 'crawler1'
Functions in program:
def parse_time_day(str):
def isday(compday):
def ready_parse(str):
def parselink(tr):
def parsebase(tr):
Modules used in program:
import re
import requests
import mysql.connector
python crawler1
Python mysql example: crawler1
import mysql.connector
from mysql.connector import errorcode
from bs4 import BeautifulSoup
import requests
import re
days = {"셀":"D0", "월":"D1", "화":"D2", "수":"D3", "목":"D4", "금":"D5", "토":"D6"}
# mysql connect
try:
cnn = mysql.connector.connect(
user = 'root',
password = 'hotspot1',
database = 'helltest'
)
print("It works")
except mysql.connector.Error as err:
if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
print("something is wrong with user or password")
elif err.errno == errorcode.ER_BAD_DB_ERROR:
print("Database Does not exist")
else:
print(err)
cursor = cnn.cursor(buffered=True)
def parsebase(tr):
data = []
tds = tr.find_all('td')
for td in tds:
data.append(td.text)
return data
def parselink(tr):
tds = tr.find('a')
tds = str(tds)
data = re.findall(r"'(.*)'", tds)
data = str(data).replace("'","").replace("[","").replace("]","")
return data
def ready_parse(str):
br = str.strip()
for day in days:
for key in day:
br = br.replace(key, key + ",")
return br
def isday(compday):
for day in days:
for key in day:
if compday == key:
return True
return False
def parse_time_day(str):
ready_p = ready_parse(str)
ans = {}
btr = re.split(r"[\:\,\.\(\)\/\s]", ready_p)
day_start = False
bring_day = ""
bring_time = ""
bring_class = ""
dayday = []
classclass = []
timetime = []
for elem in btr:
if elem == '':
continue
if isday(elem) == True:
if day_start == False:
day_start = True
else:
dayday.append(bring_day)
timetime.append(bring_time)
bring_time = ""
bring_day = days[elem]
continue
if day_start == True:
if elem.isdigit():
bring_time = bring_time + "T" + elem
# 숫자일 경우
else:
bring_class = elem
dayday.append(bring_day)
timetime.append(bring_time)
for i in range(0, len(timetime) - len(classclass)):
classclass.append(bring_class)
bring_day = ""
bring_time = ""
day_start = False
# 장소일 경우
if len(classclass) == 0:
for a in range(0, len(dayday)):
classclass.append('장소미정')
for idx, a in enumerate(dayday):
dd = dayday[idx]
tt = timetime[idx]
res1 = dd + tt
ans[res1] = classclass[idx]
return ans
url = 'http://sugang.inha.ac.kr/sugang/SU_51001/Lec_Time_Search.aspx'
r = requests.get(url)
soup = BeautifulSoup(r.text,'html.parser')
select = soup.find('select', id='ddlDept')
options = select.find_all('option')
majors = []
for o in options:
majors.append({
'name': o.text,
'value': o['value']
})
soup = BeautifulSoup(r.text,'html.parser')
select = soup.find('select', id='ddlKita')
options = select.find_all('option')
rests = []
for o in options:
rests.append({
'name': o.text,
'value': o['value']
})
event_target = ''
event_argument = ''
last_focus = ''
view_state = soup.find('input',id='__VIEWSTATE')['value']
view_state_generator = soup.find('input',id='__VIEWSTATEGENERATOR')['value']
event_validation = soup.find('input',id='__EVENTVALIDATION')['value']
# rdoKwamokGubun : 99 전체 / 06 전선 / 05 전필 / 02 교선 / 01 교필 / 09 일선 / 08 교직
for dept in majors:
r = requests.post(url,data={
'ddlDept':dept['value'],
'__EVENTTARGET': event_target,
'__EVENTARGUMENT': event_argument,
'__LASTFOCUS': last_focus,
'__VIEWSTATE': view_state,
'__VIEWSTATEGENERATOR': view_state_generator,
'__EVENTVALIDATION': event_validation,
'rdoKwamokGubun': '99',
'ibtnSearch1.x': '48',
'ibtnSearch1.y': '12',
'ibtnSearch1': '조회'
})
soup = BeautifulSoup(r.text, "html.parser")
table = soup.find('table', id='dgList')
trs = table.find_all('tr')
year = "2017"
semester = "spring"
for tr in trs:
timeclass = ""
code = ""
split_class = ""
title = ""
grade = ""
credit = 0
class_type = ""
instructor = ""
eval = ""
etc = ""
pr = parsebase(tr)
sub_link = parselink(tr)
for idx, res in enumerate(pr):
if idx == 0:
code = str(res)
elif idx == 1:
split_class = str(res)
elif idx == 2:
title = str(res)
elif idx == 3:
grade = str(res)
elif idx == 4:
credit = int(float(res))
elif idx == 5:
class_type = str(res)
elif idx == 6:
result = parse_time_day(res)
for key in result:
timeclass = timeclass + key + ":" + result[key] + ";"
elif idx == 7:
instructor = str(res)
elif idx == 8:
eval = str(res)
elif idx == 9:
etc = str(res)
siz = cursor.execute("select * from timetable where code = '%s'" % code)
isExists = cursor.fetchone()
if isExists == None:
Query = """INSERT INTO timetable (code, split_class, title, grade, credit, class_type, time, instructor, eval, etc, url, year, semester, dept_value, dept_name, active)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
cursor.execute(Query, (code, split_class, title, grade, credit, class_type, timeclass, instructor, eval, etc, sub_link, year, semester, dept['value'], dept['name'], 1))
cnn.commit()
for rest in rests:
r = requests.post(url, data={
'ddlKita':rest['value'],
'__EVENTTARGET': event_target,
'__EVENTARGUMENT': event_argument,
'__LASTFOCUS': last_focus,
'__VIEWSTATE': view_state,
'__VIEWSTATEGENERATOR': view_state_generator,
'__EVENTVALIDATION': event_validation,
'rdoKwamokGubun': '99',
'ibtnSearch2.x': '48',
'ibtnSearch2.y': '12',
'ibtnSearch2': '조회'
})
soup = BeautifulSoup(r.text, "html.parser")
table = soup.find('table', id='dgList')
trs = table.find_all('tr')
year = "2017"
semester = "spring"
for tr in trs:
timeclass = ""
code = ""
split_class = ""
title = ""
grade = ""
credit = 0
class_type = ""
instructor = ""
eval = ""
etc = ""
pr = parsebase(tr)
sub_link = parselink(tr)
for idx, res in enumerate(pr):
if idx == 0:
code = str(res)
elif idx == 1:
split_class = str(res)
elif idx == 2:
title = str(res)
elif idx == 3:
grade = str(res)
elif idx == 4:
credit = int(float(res))
elif idx == 5:
class_type = str(res)
elif idx == 6:
result = parse_time_day(res)
for key in result:
timeclass = timeclass + key + ":" + result[key] + ";"
elif idx == 7:
instructor = str(res)
elif idx == 8:
eval = str(res)
elif idx == 9:
etc = str(res)
siz = cursor.execute("select * from timetable where code = '%s'" % code)
isExists = cursor.fetchone()
if isExists == None:
Query = """INSERT INTO timetable (code, split_class, title, grade, credit, class_type, time, instructor, eval, etc, url, year, semester, dept_value, dept_name, active)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
cursor.execute(Query, (
code, split_class, title, grade, credit, class_type, timeclass, instructor, eval, etc, sub_link, year,
semester, rest["value"], rest["name"], 1))
cnn.commit()
cnn.close()
# 0 과목코드 / 1 분반그룹 / 2 과목명 / 3 학년 / 4 학점 / 5 과목구분 / 6 요일 및 강의실 / 7 담당교수 / 8 평가방식 / 9 비고
# 10 ? / 11 ? / 12 요일 시간 / 13 ?
# D0 : 웹강
# D1 ~ D7 : 월 ~ 금
# T1 ~ T25 : 1교시 ~ 25교시
Python links
- Learn Python: https://pythonbasics.org/
- Python Tutorial: https://pythonprogramminglanguage.com