HOME/Articles/

共起ネットワーク作成

Article Outline

共起ネットワーク作成

ジャカード係数を計算して、共起ネットワーク作成。

%%time
# 共起ネットワークのためのdf作成
def make_jaccard_df(df_jacc, col):
    sentences = df_jacc[col].to_list()
    # 単語組み合わせのリスト作成
    sentence_combinations = [list(itertools.combinations(sentence, 2)) for sentence in tqdm(sentences[:])]
    sentence_combinations = [[tuple(sorted(words)) for words in sentence] for sentence in tqdm(sentence_combinations)]

    #  単語の組合せの1次元のリストに変形
    target_combinations = []
    for sentence in tqdm(sentence_combinations):
        target_combinations.extend(sentence)

    ### Jaccard係数を求める ###
    # Jaccard係数 = n(A ∩ B) / n(A ∪ B)
    # 同じ文内にある2つの単語の出現回数
    combi_count = Counter(target_combinations)

    # 単語の組合せと出現回数
    word_associates = []
    for key, value in tqdm(combi_count.items()):
        word_associates.append([key[0], key[1], value])

    word_associates = pd.DataFrame(word_associates, columns=['word1', 'word2', 'intersection_count'])

    # 和集合の計算 n(A ∪ B) = n(A) + n(B) - n(A ∩ B) を利用
    # それぞれの単語の出現回数を計算
    target_words = []
    for word in target_combinations:
        target_words.extend(word)

    word_count = Counter(target_words)
    word_count = [[key, value] for key, value in word_count.items()]
    word_count = pd.DataFrame(word_count, columns=['word', 'count'])

    # 単語の組合せの出現回数のデータにそれぞれの単語の出現回数を結合
    word_associates = pd.merge(word_associates, word_count, left_on='word1', right_on='word', how='left')
    word_associates.drop(columns=['word'], inplace=True)
    word_associates.rename(columns={'count': 'count1'}, inplace=True)
    word_associates = pd.merge(word_associates, word_count, left_on='word2', right_on='word', how='left')
    word_associates.drop(columns=['word'], inplace=True)
    word_associates.rename(columns={'count': 'count2'}, inplace=True)
    word_associates['union_count'] = word_associates['count1'] + word_associates['count2'] - word_associates['intersection_count']
    print('Jaccard係数の算出')
    word_associates['jaccard_coefficient'] = word_associates['intersection_count'] / word_associates['union_count']
    word_associates=word_associates.sort_values('jaccard_coefficient')
    return word_associates

# 共起ネットワークを表示する関数
def plot_network(data, edge_threshold=0., fig_size=(20, 10), file_name=None, dir_path=None, prog='fdp'):

    nodes = list(set(data['node1'].tolist()+data['node2'].tolist()))

    G = nx.Graph()
    #  頂点の追加
    G.add_nodes_from(nodes)

    #  辺の追加
    #  edge_thresholdで枝の重みの下限を定めている
    for i in range(len(data)):
        row_data = data.iloc[i]
        if row_data['value'] > edge_threshold:
            G.add_edge(row_data['node1'], row_data['node2'], weight=row_data['value'])

    # 孤立したnodeを削除
    isolated = [n for n in G.nodes if len([i for i in nx.all_neighbors(G, n)]) == 0]
    for n in isolated:
        G.remove_node(n)

    plt.figure(figsize=fig_size)
    #pos = nx.spring_layout(G, k=0.3)  # k = node間反発係数
    #pos = nx.kamada_kawai_layout(G)
    pos = nx.nx_agraph.graphviz_layout(G, prog=prog)

    pr = nx.pagerank(G)
    #pr = nx.degree_centrality(G)
    #pr = nx.eigenvector_centrality(G)
    #pr = nx.closeness_centrality(G)

    # nodeの大きさ
    nx.draw_networkx_nodes(G, pos, node_color=list(pr.values()),
                           cmap=plt.cm.Reds,
                           alpha=0.7,
                           node_size=[60000.*v for v in pr.values()])

    # 日本語ラベル
    nx.draw_networkx_labels(G, pos, font_size=12, font_family='IPAexGothic', font_weight="bold")

    # エッジの太さ調節
    edge_width = [d["weight"] * 300 for (u, v, d) in G.edges(data=True)]
    nx.draw_networkx_edges(G, pos, alpha=0.4, edge_color="darkgrey", width=edge_width)

    plt.axis('off')
    plt.title('data length:{}, edge threshold:{}'.format(round(len(word_associates2)), round(edge_threshold,4)))
    #plt.savefig("png/network_"+name+"_"+okng+".png")
    plt.show()


# 共起ネットワークのためのdf作成
word_associates = make_jaccard_df(df_jacc, 'all_connect_wakachiSentenceList')
display(word_associates)

# 制限つけて共起ネットワーク可視化
n_word_lower = len(df_jacc)*0.1
edge_threshold = word_associates['jaccard_coefficient'].quantile(0.999)#0.01
word_associates2=word_associates.copy()
word_associates2.query('count1 >= @n_word_lower & count2 >= @n_word_lower', inplace=True)
word_associates2.query('jaccard_coefficient >= @edge_threshold', inplace=True)
word_associates2.rename(columns={'word1':'node1', 'word2':'node2', 'jaccard_coefficient':'value'}, inplace=True)
word_associates2 = word_associates2[word_associates2['node1']!=word_associates2['node2']]
display(word_associates2.head())
plot_network(data=word_associates2, edge_threshold=edge_threshold, prog='fdp')  # circo, dot, fdp, neato, nop, nop1, nop2, osage, patchwork, sfdp, twopi