HOME/🔍 BigQuery/

Google BigQuery ML

Article Outline

|| BigQueryMLでAutoML

| データセット収集

オープンデータ(人工知能研究センター)に色々あるよ!

今回は、カリフォルニア大学アーバイン校Machine Learningデータセットのリポジトリから公開されている『顧客解約予測データセット』

| クエリ

#standardSQL
/*
 * ◇ BigQuery ML
 *   先に、「MODEL_DEVLOP」を実行。実行後、コメントアウトしてView化すると便利
 *   後に、「MODEL_EVALUATE」「MODEL_PREDICT」を実行。
 */

/* MODEL_DEVLOP */
    -- create or replace model `my_dataset`.auto_ml_1 -- モデル名
    -- options(
    --       model_type = 'AUTOML_CLASSIFIER' -- 使用アルゴリズム
    --     -- , input_label_cols = [''] -- ターゲット名(カラム)
    --     , budget_hours = 1.0 -- 時間制限
    -- ) as
    -- /* クエリ(学習に使用するデータを抽出する) */
    -- select
    --     State
    --     , Account_Length
    --     , Area_Code
    --     , Total_Charge / Account_Length as avg_daily_spend
    --     , CustServ_Calls / Account_Length as avg_daily_cases
    --     , Churn_ as label -- 推論するラベル
    -- from  
    --     `my_dataset.CSV_CUSTOMER_ACTIVITY`
    -- where
    --     date(2020, 1, 1) <= Record_Date
    -- ;



with

/* MODEL_EVALUATE */

    evaluation as (
        select * from ml.evaluate(model `my_dataset`.auto_ml_1, (
            /* サブクエリ(モデル作成に使用した特徴量を抽出するクエリ) */
            select
                State
                , Account_Length
                , Area_Code
                , Total_Charge / Account_Length as avg_daily_spend
                , CustServ_Calls / Account_Length as avg_daily_cases
                , Churn_ as label
            from  
                `my_dataset.CSV_CUSTOMER_ACTIVITY`
            where
                date(2020, 1, 1) <= Record_Date
            )
        )
    )



/* MODEL_PREDICT */

    , predict as (
        select
            predicted_label
            , prob
            , State
            , Account_Length
            , avg_daily_spend
            , avg_daily_cases
        from ml.predict(model `my_dataset`.auto_ml_1, (
            /* サブクエリ(予測するのに必要な特徴量を抽出するクエリ) */
            select
                State
                , Account_Length
                , Area_Code
                , Total_Charge / Account_Length as avg_daily_spend
                , CustServ_Calls / Account_Length as avg_daily_cases
                -- , Churn_ as label -- (推論時不要)
            from  
                `my_dataset.CSV_CUSTOMER_ACTIVITY`
            where
                date(2020, 1, 1) <= Record_Date
            )
        ), unnest(predicted_label_probs)
        where
            predicted_label = 'True.' -- ∵ 2値分類のため
    )



/* OUTPUT */

select * from 
evaluation
-- predict
;
(error)Unable to identify the label column in query statement. Either specify the label column using OPTIONS(input_label_cols=['your_label_col']) or name the label column in the data as 'label'.

BigQuery ML unable to identify label column in data - stack overflow feedbackThe CREATE MODEL statement - Google Cloud

| Taitaic

「BigQueryML」でSQLを書いて機械学習モデルを構築&予測できる! - Qiita

#standardSQL
begin
/* モデル作成 */
    create model `Kaggle_titanic`.model_titanic
    options (model_type = 'logistic_reg') as
    select
        Pclass
        , title
        , is_female
        , family_size
        , is_alone
        , age
        , embarked
        , fare
        , class_age
        , Survived as label
    from 
        `Kaggle_titanic.preprocessed_data` -- 前処理終わり
    left join 
        `Kaggle_titanic.label_train` using(PassengerId)
    where 
        train_flag = 'train'
    ;
/* モデル評価 */
    select * from ml.evaluate(model `Kaggle_titanic`.model_titanic, (
        select Pclass, title, is_female, family_size, is_alone, age, embarked, fare, class_age, Survived as label
        from `Kaggle_titanic.preprocessed_data`
        left join `Kaggle_titanic.label_train` using(PassengerId)
        where train_flag = 'train'
        )
    );
/* モデル推論(テストデータに対して) */
    select * from ml.predict(model `Kaggle_titanic`.model_titanic, (
        select Pclass, title, is_female, family_size, is_alone, age, embarked, fare, class_age, Survived as label
        from `Kaggle_titanic.preprocessed_data`
        where train_flag = 'test'
        )
    );



/* 係数算出(特徴量の重み) */
    select 
        processed_input
        , weight
    from 
        ml.weights(model `Kaggle_titanic.model_titanic`)
    order by  
        abs(weight) desc
    ;

/* サブミット作成 (※Kaggle用) */
    select 
        PassengerID
        , predicted_label as Survived
    from ml.predict(model `Kaggle_titanic`.model_titanic, (
        select PassengerID, Pclass, title, is_female, family_size, is_alone, age, embarked, fare, class_age, Survived as label
        from `Kaggle_titanic.preprocessed_data`
        where train_flag = 'test'
        )
    );
end

|| REFERENCE