HOME/Articles/

mysql example crawler1 (snippet)

Article Outline

Python mysql example 'crawler1'

Functions in program:

  • def parse_time_day(str):
  • def isday(compday):
  • def ready_parse(str):
  • def parselink(tr):
  • def parsebase(tr):

Modules used in program:

  • import re
  • import requests
  • import mysql.connector

python crawler1

Python mysql example: crawler1

import mysql.connector
from mysql.connector import errorcode
from bs4 import BeautifulSoup
import requests
import re

days = {"셀":"D0", "월":"D1", "화":"D2", "수":"D3", "목":"D4", "금":"D5", "토":"D6"}

# mysql connect
try:
    cnn = mysql.connector.connect(
        user = 'root',
        password = 'hotspot1',
        database = 'helltest'
    )
    print("It works")

except mysql.connector.Error as err:
    if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
        print("something is wrong with user or password")
    elif err.errno == errorcode.ER_BAD_DB_ERROR:
        print("Database Does not exist")
    else:
        print(err)

cursor = cnn.cursor(buffered=True)


def parsebase(tr):
    data = []
    tds = tr.find_all('td')
    for td in tds:
        data.append(td.text)
    return data


def parselink(tr):
    tds = tr.find('a')
    tds = str(tds)
    data = re.findall(r"'(.*)'", tds)
    data = str(data).replace("'","").replace("[","").replace("]","")
    return data


def ready_parse(str):
    br = str.strip()
    for day in days:
        for key in day:
            br = br.replace(key, key + ",")
    return br


def isday(compday):
    for day in days:
        for key in day:
            if compday == key:
                return True
    return False


def parse_time_day(str):
    ready_p = ready_parse(str)
    ans = {}
    btr = re.split(r"[\:\,\.\(\)\/\s]", ready_p)
    day_start = False
    bring_day = ""
    bring_time = ""
    bring_class = ""
    dayday = []
    classclass = []
    timetime = []

    for elem in btr:
        if elem == '':
            continue
        if isday(elem) == True:
            if day_start == False:
                day_start = True
            else:
                dayday.append(bring_day)
                timetime.append(bring_time)
                bring_time = ""
            bring_day = days[elem]
            continue
        if day_start == True:
            if elem.isdigit():
                bring_time = bring_time + "T" + elem
                # 숫자일 경우
            else:
                bring_class = elem
                dayday.append(bring_day)
                timetime.append(bring_time)
                for i in range(0, len(timetime) - len(classclass)):
                    classclass.append(bring_class)
                bring_day = ""
                bring_time = ""
                day_start = False
                # 장소일 경우

    if len(classclass) == 0:
        for a in range(0, len(dayday)):
            classclass.append('장소미정')

    for idx, a in enumerate(dayday):
        dd = dayday[idx]
        tt = timetime[idx]
        res1 = dd + tt
        ans[res1] = classclass[idx]
    return ans

url = 'http://sugang.inha.ac.kr/sugang/SU_51001/Lec_Time_Search.aspx'
r = requests.get(url)

soup = BeautifulSoup(r.text,'html.parser')
select = soup.find('select', id='ddlDept')
options = select.find_all('option')

majors = []

for o in options:
    majors.append({
        'name': o.text,
        'value': o['value']
    })

soup = BeautifulSoup(r.text,'html.parser')
select = soup.find('select', id='ddlKita')
options = select.find_all('option')

rests = []

for o in options:
    rests.append({
        'name': o.text,
        'value': o['value']
    })


event_target = ''
event_argument = ''
last_focus = ''
view_state = soup.find('input',id='__VIEWSTATE')['value']
view_state_generator = soup.find('input',id='__VIEWSTATEGENERATOR')['value']
event_validation = soup.find('input',id='__EVENTVALIDATION')['value']

# rdoKwamokGubun : 99 전체 / 06 전선 / 05 전필 / 02 교선 / 01  교필 / 09 일선 / 08 교직


for dept in majors:
    r = requests.post(url,data={
        'ddlDept':dept['value'],
        '__EVENTTARGET': event_target,
        '__EVENTARGUMENT': event_argument,
        '__LASTFOCUS': last_focus,
        '__VIEWSTATE': view_state,
        '__VIEWSTATEGENERATOR': view_state_generator,
        '__EVENTVALIDATION': event_validation,
        'rdoKwamokGubun': '99',
        'ibtnSearch1.x': '48',
        'ibtnSearch1.y': '12',
        'ibtnSearch1': '조회'
    })
    soup = BeautifulSoup(r.text, "html.parser")
    table = soup.find('table', id='dgList')
    trs = table.find_all('tr')
    year = "2017"
    semester = "spring"
    for tr in trs:
        timeclass = ""
        code = ""
        split_class = ""
        title = ""
        grade = ""
        credit = 0
        class_type = ""
        instructor = ""
        eval = ""
        etc = ""
        pr = parsebase(tr)
        sub_link = parselink(tr)
        for idx, res in enumerate(pr):
            if idx == 0:
                code = str(res)
            elif idx == 1:
                split_class = str(res)
            elif idx == 2:
                title = str(res)
            elif idx == 3:
                grade = str(res)
            elif idx == 4:
                credit = int(float(res))
            elif idx == 5:
                class_type = str(res)
            elif idx == 6:
                result = parse_time_day(res)
                for key in result:
                    timeclass = timeclass + key + ":" + result[key] + ";"
            elif idx == 7:
                instructor = str(res)
            elif idx == 8:
                eval = str(res)
            elif idx == 9:
                etc = str(res)
        siz = cursor.execute("select * from timetable where code = '%s'" % code)
        isExists = cursor.fetchone()
        if isExists == None:
            Query = """INSERT INTO timetable (code, split_class, title, grade, credit, class_type, time, instructor, eval, etc, url, year, semester, dept_value, dept_name, active)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
            cursor.execute(Query, (code, split_class, title, grade, credit, class_type, timeclass, instructor, eval, etc, sub_link, year, semester, dept['value'], dept['name'], 1))
            cnn.commit()

for rest in rests:
    r = requests.post(url, data={
        'ddlKita':rest['value'],
        '__EVENTTARGET': event_target,
        '__EVENTARGUMENT': event_argument,
        '__LASTFOCUS': last_focus,
        '__VIEWSTATE': view_state,
        '__VIEWSTATEGENERATOR': view_state_generator,
        '__EVENTVALIDATION': event_validation,
        'rdoKwamokGubun': '99',
        'ibtnSearch2.x': '48',
        'ibtnSearch2.y': '12',
        'ibtnSearch2': '조회'
    })
    soup = BeautifulSoup(r.text, "html.parser")
    table = soup.find('table', id='dgList')
    trs = table.find_all('tr')
    year = "2017"
    semester = "spring"
    for tr in trs:
        timeclass = ""
        code = ""
        split_class = ""
        title = ""
        grade = ""
        credit = 0
        class_type = ""
        instructor = ""
        eval = ""
        etc = ""
        pr = parsebase(tr)
        sub_link = parselink(tr)
        for idx, res in enumerate(pr):
            if idx == 0:
                code = str(res)
            elif idx == 1:
                split_class = str(res)
            elif idx == 2:
                title = str(res)
            elif idx == 3:
                grade = str(res)
            elif idx == 4:
                credit = int(float(res))
            elif idx == 5:
                class_type = str(res)
            elif idx == 6:
                result = parse_time_day(res)
                for key in result:
                    timeclass = timeclass + key + ":" + result[key] + ";"
            elif idx == 7:
                instructor = str(res)
            elif idx == 8:
                eval = str(res)
            elif idx == 9:
                etc = str(res)
        siz = cursor.execute("select * from timetable where code = '%s'" % code)
        isExists = cursor.fetchone()
        if isExists == None:
            Query = """INSERT INTO timetable (code, split_class, title, grade, credit, class_type, time, instructor, eval, etc, url, year, semester, dept_value, dept_name, active)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
            cursor.execute(Query, (
            code, split_class, title, grade, credit, class_type, timeclass, instructor, eval, etc, sub_link, year,
            semester, rest["value"], rest["name"], 1))
            cnn.commit()
cnn.close()


# 0 과목코드 / 1 분반그룹 / 2 과목명 / 3 학년 / 4 학점 / 5 과목구분 / 6 요일 및 강의실 / 7 담당교수 / 8 평가방식 / 9 비고
            # 10 ? / 11 ? / 12 요일 시간 / 13 ?

            # D0 : 웹강
            # D1 ~ D7 : 월 ~ 금
            # T1 ~ T25 : 1교시 ~ 25교시