HOME/Articles/

mysql example generateHeatmapData (snippet)

Article Outline

Python mysql example 'generateHeatmapData'

Functions in program:

  • def runQuery(query):

Modules used in program:

  • import sys
  • import progressbar
  • import mysql.connector

python generateHeatmapData

Python mysql example: generateHeatmapData

import mysql.connector
import progressbar
import sys

def runQuery(query):
    c.execute(query)
    (res,) = c.fetchone()
    return res

if len(sys.argv)<2:
    print("No tolerance given")
    exit()

tolerance=sys.argv[1]

# Boundaries based on the 99th percentile for character count and number of snippets
MAX_CHARS = 17000
CHAR_STEP = 850
MAX_SNIPPETS = 80
SNIPPET_STEP= 4

heatmap_data = []
# MySQL conifig
config = {
  'user': 'root',
  'password': 'password',
  'host': 'localhost',
  'database': 'jupyter',
  'raise_on_warnings': True
}

conn = mysql.connector.connect(**config)
c = conn.cursor(buffered=True)

# Progressbar stuff
counter = 0
bar = progressbar.ProgressBar(maxval=400,
                               widgets=[progressbar.Bar('=', '[', ']'), ' ',
                               progressbar.Percentage()])
bar.start()

# Iterate over the various ranges of snippets and character count
for j in range(0,MAX_SNIPPETS-SNIPPET_STEP, SNIPPET_STEP):
    snippet_lower_bound = j
    snippet_upper_bound = j+10

    for i in range(0,MAX_CHARS,CHAR_STEP):
        counter += 1
        bar.update(counter)

        char_lower_bound = i
        char_upper_bound = (i+CHAR_STEP)

        # Running with duplication tolerance larger than 1 will include complete clones
        query = '''SELECT AVG(DuplicationRatio)\
                   FROM Notebooks\
                   WHERE\
                        NoOfSnippets BETWEEN {} AND {}\
                        AND NoOfChars BETWEEN {} AND {}\
                        AND DuplicationRatio<{};'''.format(snippet_lower_bound,
                                                               snippet_upper_bound,
                                                               char_lower_bound,
                                                               char_upper_bound,
                                                               tolerance)

        result = runQuery(query)

        # Missing results are read as "NaN" by seaborn
        if not result:
            result = "NaN"
        range_result = "%s,%s,%s" % (snippet_upper_bound, char_upper_bound, result)
        heatmap_data.append(range_result)

bar.finish()
conn.close()

headers = "NumberOfSnippets,Chars,DuplicateFraction"
# Save result to file
with open("csv/heatmapData_{}.csv".format(tolerance), "w") as f:
    f.write(headers+'\n')
    for d in heatmap_data:
        f.write(d+'\n')

print("Finishes for duplication tolerance of {}".format(tolerance))