# NOTE(review): the star import stays first so the explicit imports below win
# any name collisions with frb_ml_utils — keep this order.
from frb_ml_utils import *
import frb_ml_utils

import numpy as np
import pandas as pd

from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import fbeta_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline  # used by calculate_f2_2d
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from joblib import Parallel, delayed
# Load the CHIME/FRB catalogue and transform the features used for
# repeater-vs-non-repeater classification.
CHIME = load_chime()
columns_to_use = ['bc_width', 'flux', 'fluence', 'dm_exc_ne2001',
                  'peak_freq',
                  'bright_temp', 'rest_width', 'freq_width', 'energy']
# Log-transform the heavy-tailed features so their dynamic range is manageable.
CHIME['bright_temp'] = np.log10(CHIME['bright_temp'])
CHIME['energy'] = np.log10(CHIME['energy'])
# Rescale widths by 1000 — presumably seconds -> milliseconds; TODO confirm
# against load_chime()'s column units.
CHIME['rest_width'] = CHIME['rest_width'] * 1000
CHIME['bc_width'] = CHIME['bc_width'] * 1000
CHIME['freq_width'] = np.log10(CHIME['freq_width'])
# Number of bootstrap (bagging) repetitions for the score estimates below.
bagging_times = 1000
bagging_times def calculate_f2(i):
= CHIME[columns_to_use]
chime_data = (CHIME['repeater_name'] != '-9999').to_numpy().astype('int')
chime_target = np.zeros(6)
temp_f2 = train_test_split(chime_data,chime_target,test_size=0.3,stratify=chime_target)
X,test_X,y,test_y
= StandardScaler()
scaler
scaler.fit(X) = scaler.transform(X)
X = scaler.transform(test_X)
test_X = scaler.transform(chime_data)
chime_data
= SMOTE().fit_resample(X, y)
X, y
= RandomForestClassifier()
clf
clf.fit(X, y)= clf.predict(test_X)
predictions 0] = fbeta_score(test_y,predictions,beta=2)
temp_f2[
= AdaBoostClassifier()
clf
clf.fit(X, y)= clf.predict(test_X)
predictions 1] = fbeta_score(test_y,predictions,beta=2)
temp_f2[
= LGBMClassifier()
clf
clf.fit(X, y)= clf.predict(test_X)
predictions 2] = fbeta_score(test_y,predictions,beta=2)
temp_f2[
= XGBClassifier(use_label_encoder=False,eval_metric='logloss')
clf
clf.fit(X, y)= clf.predict(test_X)
predictions 3] = fbeta_score(test_y,predictions,beta=2)
temp_f2[
= svm.SVC()
clf
clf.fit(X, y)= clf.predict(test_X)
predictions 4] = fbeta_score(test_y,predictions,beta=2)
temp_f2[
= NearestCentroid()
clf
clf.fit(X, y)= clf.predict(test_X)
predictions 5] = fbeta_score(test_y,predictions,beta=2)
temp_f2[
return temp_f2
= np.array(Parallel(n_jobs=6,verbose=10)(delayed(calculate_f2)(i) for i in range(bagging_times))) f2_list
2 78.8 0.00225301 FRB20180729A
12 101.5 0.00225301 FRB20180814A
38 101.0 0.00225301 FRB20180919A
49 94.7 0.00225301 FRB20180928A
75 101.3 0.00225301 FRB20181028A
76 101.3 0.00225301 FRB20181028A
77 101.3 0.00225301 FRB20181028A
78 101.3 0.00225301 FRB20181028A
79 101.3 0.00225301 FRB20181028A
81 62.3 0.00225301 FRB20181030A
82 62.5 0.00225301 FRB20181030B
158 83.6 0.00225301 FRB20181220A
174 92.6 0.00225301 FRB20181223C
221 96.1 0.00225301 FRB20190107B
399 100.8 0.00225301 FRB20190329A
459 79.4 0.00225301 FRB20190425A
571 100.7 0.00225301 FRB20190625E
572 100.7 0.00225301 FRB20190625E
573 100.7 0.00225301 FRB20190625E
576 101.5 0.00225301 FRB20190626A
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done 1 tasks | elapsed: 1.9s
[Parallel(n_jobs=6)]: Done 6 tasks | elapsed: 1.9s
[Parallel(n_jobs=6)]: Done 13 tasks | elapsed: 2.8s
[Parallel(n_jobs=6)]: Done 20 tasks | elapsed: 3.4s
[Parallel(n_jobs=6)]: Done 29 tasks | elapsed: 3.9s
[Parallel(n_jobs=6)]: Done 38 tasks | elapsed: 4.9s
[Parallel(n_jobs=6)]: Done 49 tasks | elapsed: 5.7s
[Parallel(n_jobs=6)]: Done 60 tasks | elapsed: 6.4s
[Parallel(n_jobs=6)]: Done 73 tasks | elapsed: 7.6s
[Parallel(n_jobs=6)]: Done 86 tasks | elapsed: 8.7s
[Parallel(n_jobs=6)]: Done 101 tasks | elapsed: 9.8s
[Parallel(n_jobs=6)]: Done 116 tasks | elapsed: 11.2s
[Parallel(n_jobs=6)]: Done 133 tasks | elapsed: 12.8s
[Parallel(n_jobs=6)]: Done 150 tasks | elapsed: 14.0s
[Parallel(n_jobs=6)]: Done 169 tasks | elapsed: 15.6s
[Parallel(n_jobs=6)]: Done 188 tasks | elapsed: 17.1s
[Parallel(n_jobs=6)]: Done 209 tasks | elapsed: 19.4s
[Parallel(n_jobs=6)]: Done 230 tasks | elapsed: 21.5s
[Parallel(n_jobs=6)]: Done 253 tasks | elapsed: 23.9s
[Parallel(n_jobs=6)]: Done 276 tasks | elapsed: 26.7s
[Parallel(n_jobs=6)]: Done 301 tasks | elapsed: 30.9s
[Parallel(n_jobs=6)]: Done 326 tasks | elapsed: 35.0s
[Parallel(n_jobs=6)]: Done 353 tasks | elapsed: 38.9s
[Parallel(n_jobs=6)]: Done 380 tasks | elapsed: 41.8s
[Parallel(n_jobs=6)]: Done 409 tasks | elapsed: 44.9s
[Parallel(n_jobs=6)]: Done 438 tasks | elapsed: 47.7s
[Parallel(n_jobs=6)]: Done 469 tasks | elapsed: 50.8s
[Parallel(n_jobs=6)]: Done 500 tasks | elapsed: 53.7s
[Parallel(n_jobs=6)]: Done 533 tasks | elapsed: 57.1s
[Parallel(n_jobs=6)]: Done 566 tasks | elapsed: 1.0min
[Parallel(n_jobs=6)]: Done 601 tasks | elapsed: 1.1min
[Parallel(n_jobs=6)]: Done 636 tasks | elapsed: 1.2min
[Parallel(n_jobs=6)]: Done 673 tasks | elapsed: 1.3min
[Parallel(n_jobs=6)]: Done 710 tasks | elapsed: 1.4min
[Parallel(n_jobs=6)]: Done 749 tasks | elapsed: 1.5min
[Parallel(n_jobs=6)]: Done 788 tasks | elapsed: 1.6min
[Parallel(n_jobs=6)]: Done 829 tasks | elapsed: 1.6min
[Parallel(n_jobs=6)]: Done 870 tasks | elapsed: 1.7min
[Parallel(n_jobs=6)]: Done 913 tasks | elapsed: 1.8min
[Parallel(n_jobs=6)]: Done 956 tasks | elapsed: 1.9min
[Parallel(n_jobs=6)]: Done 1000 out of 1000 | elapsed: 1.9min finished
np.average(f2_list, axis=0)
array([0.78207573, 0.76660985, 0.7831917 , 0.78434569, 0.8179593 ,
0.71472412])
np.std(f2_list, axis=0)
array([0.06426469, 0.06167393, 0.06466454, 0.0630597 , 0.04827037,
0.06143691])
# decision tree (all features)
# Reload and re-transform the catalogue for the decision-tree experiment
# (the transforms above mutate CHIME in place, so a fresh load avoids
# applying log10 / *1000 twice).
CHIME = load_chime()
columns_to_use = ['bc_width', 'flux', 'fluence', 'dm_exc_ne2001',
                  'peak_freq',
                  'bright_temp', 'rest_width', 'freq_width', 'energy']
# Same feature transforms as the multi-classifier experiment above.
CHIME['bright_temp'] = np.log10(CHIME['bright_temp'])
CHIME['energy'] = np.log10(CHIME['energy'])
CHIME['rest_width'] = CHIME['rest_width'] * 1000
CHIME['bc_width'] = CHIME['bc_width'] * 1000
CHIME['freq_width'] = np.log10(CHIME['freq_width'])
bagging_times = 1000
def calculate_f2_tree(i):
    """One bagging iteration for a depth-5 decision tree on all features.

    Parameters
    ----------
    i : int
        Iteration index (unused; present so joblib can enumerate tasks).

    Returns
    -------
    float
        F2 score on a fresh stratified 30% hold-out.
    """
    chime_data = CHIME[columns_to_use]
    chime_target = (CHIME['repeater_name'] != '-9999').to_numpy().astype('int')
    X, test_X, y, test_y = train_test_split(
        chime_data, chime_target, test_size=0.3, stratify=chime_target)
    # No standardization here (unlike calculate_f2): trees are invariant
    # to monotone feature scaling.
    X, y = SMOTE().fit_resample(X, y)
    clf = DecisionTreeClassifier(max_depth=5)
    clf.fit(X, y)
    predictions = clf.predict(test_X)
    return fbeta_score(test_y, predictions, beta=2)


f2_tree_list = np.array(Parallel(n_jobs=6, verbose=10)(
    delayed(calculate_f2_tree)(i) for i in range(bagging_times)))
2 78.8 0.00225301 FRB20180729A
12 101.5 0.00225301 FRB20180814A
38 101.0 0.00225301 FRB20180919A
49 94.7 0.00225301 FRB20180928A
75 101.3 0.00225301 FRB20181028A
76 101.3 0.00225301 FRB20181028A
77 101.3 0.00225301 FRB20181028A
78 101.3 0.00225301 FRB20181028A
79 101.3 0.00225301 FRB20181028A
81 62.3 0.00225301 FRB20181030A
82 62.5 0.00225301 FRB20181030B
158 83.6 0.00225301 FRB20181220A
174 92.6 0.00225301 FRB20181223C
221 96.1 0.00225301 FRB20190107B
399 100.8 0.00225301 FRB20190329A
459 79.4 0.00225301 FRB20190425A
571 100.7 0.00225301 FRB20190625E
572 100.7 0.00225301 FRB20190625E
573 100.7 0.00225301 FRB20190625E
576 101.5 0.00225301 FRB20190626A
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done 1 tasks | elapsed: 0.0s
[Parallel(n_jobs=6)]: Batch computation too fast (0.0132s.) Setting batch_size=2.
[Parallel(n_jobs=6)]: Done 6 tasks | elapsed: 0.0s
[Parallel(n_jobs=6)]: Done 14 tasks | elapsed: 0.0s
[Parallel(n_jobs=6)]: Batch computation too fast (0.0326s.) Setting batch_size=4.
[Parallel(n_jobs=6)]: Done 28 tasks | elapsed: 0.1s
[Parallel(n_jobs=6)]: Batch computation too fast (0.0642s.) Setting batch_size=8.
[Parallel(n_jobs=6)]: Done 56 tasks | elapsed: 0.1s
[Parallel(n_jobs=6)]: Batch computation too fast (0.1048s.) Setting batch_size=16.
[Parallel(n_jobs=6)]: Done 100 tasks | elapsed: 0.2s
[Parallel(n_jobs=6)]: Done 196 tasks | elapsed: 0.4s
[Parallel(n_jobs=6)]: Done 372 tasks | elapsed: 0.7s
[Parallel(n_jobs=6)]: Done 580 tasks | elapsed: 1.1s
[Parallel(n_jobs=6)]: Done 788 tasks | elapsed: 1.4s
[Parallel(n_jobs=6)]: Done 923 tasks | elapsed: 1.6s
[Parallel(n_jobs=6)]: Done 953 tasks | elapsed: 1.7s
[Parallel(n_jobs=6)]: Done 985 tasks | elapsed: 1.7s
[Parallel(n_jobs=6)]: Done 1000 out of 1000 | elapsed: 1.7s finished
np.average(f2_tree_list)
0.7351075923859341
np.std(f2_tree_list)
0.0631730026032148
# decision tree (T_B and delta nu)
# Fresh reload again so the in-place transforms are applied exactly once.
CHIME = load_chime()
columns_to_use = ['bc_width', 'flux', 'fluence', 'dm_exc_ne2001',
                  'peak_freq',
                  'bright_temp', 'rest_width', 'freq_width', 'energy']
CHIME['bright_temp'] = np.log10(CHIME['bright_temp'])
CHIME['energy'] = np.log10(CHIME['energy'])
CHIME['rest_width'] = CHIME['rest_width'] * 1000
CHIME['bc_width'] = CHIME['bc_width'] * 1000
CHIME['freq_width'] = np.log10(CHIME['freq_width'])
# Two-feature variant: log brightness temperature and log frequency width.
d2_columns = ['bright_temp', 'freq_width']
chime_data_2d = CHIME[d2_columns]
bagging_times = 1000
def calculate_f2_2d(i):
    """One bagging iteration for a depth-3 tree on the two-feature set.

    Uses an imblearn Pipeline so that scaling and SMOTE are fit on the
    training fold only, and SMOTE is automatically skipped at predict time.

    Parameters
    ----------
    i : int
        Iteration index (unused; present so joblib can enumerate tasks).

    Returns
    -------
    float
        F2 score on a fresh stratified 30% hold-out.
    """
    # NOTE(review): the original also built CHIME[columns_to_use] here but
    # never used it; that dead statement was removed.
    chime_target = (CHIME['repeater_name'] != '-9999').to_numpy().astype('int')
    X_2d, test_X_2d, y_2d, test_y_2d = train_test_split(
        chime_data_2d, chime_target, test_size=0.3, stratify=chime_target)
    clf = imbpipeline(steps=[['scaler', StandardScaler()],
                             ['smote', SMOTE()],
                             ['classifier', DecisionTreeClassifier(max_depth=3)]])
    clf.fit(X_2d, y_2d)
    predictions = clf.predict(test_X_2d)
    return fbeta_score(test_y_2d, predictions, beta=2)


f2_2d_list = np.array(Parallel(n_jobs=6, verbose=10)(
    delayed(calculate_f2_2d)(i) for i in range(bagging_times)))
2 78.8 0.00225301 FRB20180729A
12 101.5 0.00225301 FRB20180814A
38 101.0 0.00225301 FRB20180919A
49 94.7 0.00225301 FRB20180928A
75 101.3 0.00225301 FRB20181028A
76 101.3 0.00225301 FRB20181028A
77 101.3 0.00225301 FRB20181028A
78 101.3 0.00225301 FRB20181028A
79 101.3 0.00225301 FRB20181028A
81 62.3 0.00225301 FRB20181030A
82 62.5 0.00225301 FRB20181030B
158 83.6 0.00225301 FRB20181220A
174 92.6 0.00225301 FRB20181223C
221 96.1 0.00225301 FRB20190107B
399 100.8 0.00225301 FRB20190329A
459 79.4 0.00225301 FRB20190425A
571 100.7 0.00225301 FRB20190625E
572 100.7 0.00225301 FRB20190625E
573 100.7 0.00225301 FRB20190625E
576 101.5 0.00225301 FRB20190626A
[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done 1 tasks | elapsed: 1.1s
[Parallel(n_jobs=6)]: Done 6 tasks | elapsed: 1.1s
[Parallel(n_jobs=6)]: Done 13 tasks | elapsed: 1.1s
[Parallel(n_jobs=6)]: Done 20 tasks | elapsed: 1.2s
[Parallel(n_jobs=6)]: Batch computation too fast (0.1727s.) Setting batch_size=2.
[Parallel(n_jobs=6)]: Done 29 tasks | elapsed: 1.2s
[Parallel(n_jobs=6)]: Batch computation too fast (0.0336s.) Setting batch_size=4.
[Parallel(n_jobs=6)]: Done 40 tasks | elapsed: 1.2s
[Parallel(n_jobs=6)]: Done 64 tasks | elapsed: 1.2s
[Parallel(n_jobs=6)]: Batch computation too fast (0.0444s.) Setting batch_size=8.
[Parallel(n_jobs=6)]: Done 108 tasks | elapsed: 1.3s
[Parallel(n_jobs=6)]: Batch computation too fast (0.0899s.) Setting batch_size=16.
[Parallel(n_jobs=6)]: Done 220 tasks | elapsed: 1.5s
[Parallel(n_jobs=6)]: Batch computation too fast (0.1745s.) Setting batch_size=32.
[Parallel(n_jobs=6)]: Done 460 tasks | elapsed: 1.9s
[Parallel(n_jobs=6)]: Done 878 tasks | elapsed: 2.3s
[Parallel(n_jobs=6)]: Done 893 tasks | elapsed: 2.3s
[Parallel(n_jobs=6)]: Done 1000 out of 1000 | elapsed: 2.3s finished
np.average(f2_2d_list)
0.7368846509966182
np.std(f2_2d_list)
0.049852263048514395