HOME/Articles/

mysql example data-mining-hw1-pm2 5-preprocessor (snippet)

Article Outline

Python mysql example 'data-mining-hw1-pm2 5-preprocessor'

Modules used in program:

  • import time
  • import _mysql

python data-mining-hw1-pm2 5-preprocessor

Python mysql example: data-mining-hw1-pm2 5-preprocessor

import _mysql
import time

FILE_NAMES = [
    'C:\Users\go-creating\Desktop\data-mining-hw1\dataset\data.log-20160918',
    'C:\Users\go-creating\Desktop\data-mining-hw1\dataset\data.log-20160919',
    'C:\Users\go-creating\Desktop\data-mining-hw1\dataset\data.log-20160920',
    'C:\Users\go-creating\Desktop\data-mining-hw1\dataset\data.log-20160921',
    'C:\Users\go-creating\Desktop\data-mining-hw1\dataset\data.log-20160922',
    'C:\Users\go-creating\Desktop\data-mining-hw1\dataset\data.log-20160923',
    'C:\Users\go-creating\Desktop\data-mining-hw1\dataset\data.log-20160924',
    'C:\Users\go-creating\Desktop\data-mining-hw1\dataset\data.log-20160925',
]
SKIP_LINE_COUNT = 26

db=_mysql.connect(
    host='localhost',
    user='root',
    passwd='root',
    db='data_mining'
)

for filename in FILE_NAMES:
    with open(filename) as f:
        print('================')
        print('start parsing', filename)

        # line indicator
        i = SKIP_LINE_COUNT
        startTime = time.time()

        for _ in range(SKIP_LINE_COUNT):
            next(f)
        for line in f:
            # debugger
            i = i + 1
            if i % 10000 == 0:
                print(i, 'entries parsed')

            # remove \n character
            line = line[:-1]
            lineParts = line.split(' ')
            # avoid empty line
            if len(lineParts) == 2:
                entries = lineParts[1].split('|')[1:]
                fields = ['`topic_group`']
                values = ['\'' + lineParts[0].split('/')[1] + '\'']

                for entry in entries:
                    parts = entry.replace('\'', '').split('=')
                    # avoid empty entry
                    if len(parts) == 2:
                        field = '`' + parts[0] + '`'
                        # avoid `NaN` sensor value
                        value = ('\'' + parts[1] + '\'').replace("'NaN'", 'NULL')
                        # avoid duplicate fields
                        if field not in fields:
                            fields.append(field)
                            values.append(value)
                queryString = (
                    """
                    INSERT INTO raw (%s) VALUES (%s)
                    """ % (','.join(fields), ','.join(values))
                )
                try:
                    db.query(queryString)
                except Exception as err:
                    # error reporting
                    print('')
                    print('==================== %d line ====================' % i)
                    print(line)
                    print('')
                    print(fields)
                    print('')
                    print(values)
                    print('')
                    print(err)

        elapsedTime = time.time() - startTime
        print(i - SKIP_LINE_COUNT, 'lines parsed in ', elapsedTime, 'seconds')