Sklearnでよく使われる方法
7024 ワード
トレーニングセットとテストセットを分割
統計Series値出現回数
異常データ処理
ワンホットエンコーディング（One-Hot Encoding）
多項式拡張
標準化
正規化する
LabelEncoder
Dataframeサンプルサンプリング
LinearRegression
Ridge
Lasso
モデル評価
混同行列
モデルの保存
関数画像の描画
Dfコピー
Pythonコピー
CountVectorizer
TfidfVectorizer
apply
tfidf
# Split a labelled DataFrame into training and test halves (reproducible).
from sklearn.model_selection import train_test_split

# NOTE(review): assumes a DataFrame `a` with a "label" column exists —
# it is defined further down in this article.
X = a.iloc[:, :-1]   # every column except the last
Y = a["label"]       # target column
# random_state pins the shuffle so the split is reproducible run-to-run
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.5, random_state=0
)
Y_train
統計Series値出現回数
# Frequency of each distinct value in the Series, sorted descending.
a["feature_1"].value_counts()
異常データ処理
# Treat '?' as missing, then drop every row that contains any NaN.
# NOTE(review): relies on `np` (numpy) being imported elsewhere;
# the result is not assigned, so `a` itself is unchanged.
a.replace('?', np.nan).dropna(how = 'any')
ワンホットエンコーディング（One-Hot Encoding）
# One-hot encoding demo: fit an encoder on `a`, then transform a new frame
# `b` containing an unseen category (handled via handle_unknown="ignore").
import pandas as pd

a = pd.DataFrame([[1, 2, 3],
                  [4, 5, 6],
                  [1, 8, 9]], columns=["feature_1", "feature_2", "label"])

from sklearn.preprocessing import OneHotEncoder

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed
# in 1.4; sparse_output=False returns a dense ndarray instead of a sparse matrix.
hotCoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
hot = hotCoder.fit_transform(a)
pd.DataFrame(hot)

b = pd.DataFrame([[1, 2, 3],
                  [4, 5, 6],
                  [10, 8, 9]], columns=["feature_1", "feature_2", "label"])
# The unseen value 10 encodes as an all-zero row because handle_unknown="ignore".
hotCoder.transform(b)
多項式拡張
# Polynomial feature expansion: degree-2 terms (bias, x_i, x_i*x_j, x_i^2).
import pandas as pd

a = pd.DataFrame([[1, 2, 3],
                  [4, 5, 6],
                  [1, 8, 9]], columns=["feature_1", "feature_2", "label"])

from sklearn.preprocessing import PolynomialFeatures

polyCoder = PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)
df = polyCoder.fit_transform(a)
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the replacement.
pd.DataFrame(df, columns=polyCoder.get_feature_names_out())
標準化
# Standardisation demo: z-score each column (zero mean, unit variance).
import pandas as pd

a = pd.DataFrame(
    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    columns=["feature_1", "feature_2", "label"],
)

from sklearn.preprocessing import StandardScaler

ssCoder = StandardScaler()
df = ssCoder.fit_transform(a)  # dense ndarray of scaled values
pd.DataFrame(df)
正規化する
# Normalisation demo: rescale every column linearly into the range [-1, 2].
import pandas as pd

a = pd.DataFrame(
    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    columns=["feature_1", "feature_2", "label"],
)

from sklearn.preprocessing import MinMaxScaler

ssCoder = MinMaxScaler(feature_range=(-1, 2))
df = ssCoder.fit_transform(a)  # column min -> -1, column max -> 2
pd.DataFrame(df)
LabelEncoder
# LabelEncoder demo: map string categories to integer codes and append
# them as an extra column next to the original frame.
from sklearn.preprocessing import LabelEncoder
import pandas as pd

a = pd.DataFrame(
    [["b", 2, 3], ["a", 5, 6], ["a", 8, 9]],
    columns=["feature_1", "feature_2", "label"],
)

laCoder = LabelEncoder()
codes = laCoder.fit_transform(a["feature_1"])  # "a" -> 0, "b" -> 1 (sorted order)
b = pd.DataFrame(codes)
pd.concat([a, b], axis=1)
Dataframeサンプルサンプリング
# Random sampling from a DataFrame (depends on `a` defined above).
df = a.sample(frac=0.66)  # roughly 66% of the rows, without replacement
df = a.sample(n=3)  # exactly 3 rows (overwrites the previous sample)
pd.concat([a,df])  # stack the original frame and the sampled rows
LinearRegression
# Ordinary least squares on a tiny hand-made dataset.
import numpy as np

# np.mat is deprecated; plain 2-D arrays behave identically with scikit-learn.
X = np.array([[1, 1], [2, 1], [3, 1], [4, 1]])
Y = np.array([[3.2], [4.7], [7.3], [8.5]])

from sklearn.linear_model import LinearRegression

# fit_intercept=False because the constant column of ones is already in X.
model = LinearRegression(fit_intercept=False)
model.fit(X, Y)
model.coef_        # fitted weights (slope and intercept term)
model.score(X, Y)  # R^2 on the training data
Ridge
# Ridge regression: sweep the L2 penalty strength and report test MSE.
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error as mse  # `mse` was undefined in the original

for alpha in [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 2, 3, 5, 10]:
    clf = Ridge(alpha=alpha, max_iter=2000, solver="auto", fit_intercept=True)
    clf.fit(X_train, Y_train)  # X_train/Y_train from the split earlier in the article
    print("Ridge:", mse(Y_test.values, clf.predict(X_test)))
    print(clf.n_iter_)  # iterations used by the solver (None for closed-form solvers)
Lasso
# Lasso regression: sweep the L1 penalty strength and report test MSE.
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error as mse  # `mse` was undefined in the original

for alpha in [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 2, 3]:
    clf = Lasso(alpha=alpha, max_iter=100, fit_intercept=True)
    clf.fit(X_train, Y_train)  # X_train/Y_train from the split earlier in the article
    print("Lasso:", mse(Y_test.values, clf.predict(X_test)))
    print(clf.n_iter_)  # coordinate-descent iterations actually run
モデル評価
# Report the mean squared error of the most recently fitted model
# (`clf` from the snippets above) on the held-out test set.
from sklearn.metrics import mean_squared_error

predictions = clf.predict(X_test)
print("LinearRegression:", mean_squared_error(Y_test.values, predictions))
混同行列
# Confusion matrix via crosstab: rows = true labels, columns = predictions.
# NOTE(review): `knn` is a fitted classifier defined elsewhere, not in this article.
pd.crosstab(Y_test,knn.predict(X_test),rownames=["label"],colnames=["predict"])
モデルの保存
# Persist a fitted transformer/model to disk and load it back.
# `sklearn.externals.joblib` was removed in scikit-learn 0.23 —
# import the standalone `joblib` package instead.
import joblib

joblib.dump(enc, 'rf.model')    # serialise the fitted encoder (`enc` from the apply demo)
enc2 = joblib.load('rf.model')  # restore it as a new object
b = enc2.transform(a).toarray() # the loaded model behaves like the original
pd.DataFrame(b)
関数画像の描画
# Plot the sigmoid (logistic) function on [-5, 5].
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(-5, 5, 1000)  # 1000 evenly spaced sample points
y = 1 / (1 + np.exp(-x))      # vectorised sigmoid (replaces the per-element Python loop)
plt.plot(x, y)
plt.show()
Dfコピー
# DataFrame.copy() makes an independent copy: dropping a column from the
# copy leaves the original untouched (unlike plain assignment, which
# would alias the same object).
import pandas as pd

a = pd.DataFrame(
    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    columns=["feature_1", "feature_2", "label"],
)

df = a.copy()
df.drop(columns=["feature_1"], inplace=True)

# Different ids prove `df` is a distinct object from `a`.
print(id(a))
print(id(df))
a
Pythonコピー
# copy.deepcopy clones nested structures: mutating the original's inner
# list afterwards does not affect the deep copy.
import copy

a = [1, 2, [1, 2]]
b = copy.deepcopy(a)
a[2][0] = -1  # only `a` changes; `b` keeps the original nested values
b
CountVectorizer
# Bag-of-words and TF-IDF on a tiny corpus, then a Gaussian naive Bayes fit.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

# NOTE(review): the article's original example sentences were lost in
# extraction; these blank placeholder strings will raise "empty vocabulary"
# — substitute real documents before running.
corpus = [
    ' ',
    ' '
]
y = [0, 1]

# Raw string for the regex. The original pattern put `|` inside the character
# class, where it matches a literal pipe character rather than acting as
# alternation; it has been dropped. \u4e00-\u9fa5 covers common CJK ideographs.
vectorizer = CountVectorizer(token_pattern=r"[a-zA-Z\u4e00-\u9fa5]+")
count = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2.
print(vectorizer.get_feature_names_out())
print(count.toarray())

# TF-IDF derived from the raw counts...
transformer = TfidfTransformer()
tfidf_matrix = transformer.fit_transform(count)
print(tfidf_matrix.toarray())

# ...or computed in a single step with TfidfVectorizer.
tfidf_vec = TfidfVectorizer(token_pattern=r"[a-zA-Z\u4e00-\u9fa5]+")
tfidf_matrix = tfidf_vec.fit_transform(corpus)
print(tfidf_matrix.toarray())

from sklearn.naive_bayes import GaussianNB

# GaussianNB requires a dense array, hence .toarray().
model = GaussianNB()
model.fit(tfidf_matrix.toarray(), y)
print(model.predict(tfidf_matrix.toarray()))

# Score previously unseen documents with the already-fitted vectoriser/model.
corpus = [
    ' ',
    ' '
]
tfidf_matrix = tfidf_vec.transform(corpus)
model.predict(tfidf_matrix.toarray())
TfidfVectorizer
# End-to-end pipeline: TF-IDF features from a labelled text file + GaussianNB.
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split

# NOTE(review): expects a headerless CSV where column 0 is the class label
# and column 1 is the document text — verify against the actual data file.
df = pd.read_csv("datas/bayes.txt", header=None)
X = df[1]
Y = df[0]

tfCoder = TfidfVectorizer(token_pattern=r"[a-zA-Z\u4e00-\u9fa5]+")
X = tfCoder.fit_transform(X).toarray()

# The original used test_size=0, which raises ValueError (the test split
# must be non-empty); 0.2 is a conventional hold-out fraction.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
model.fit(X_train, y_train)
print(model.predict(X_train))  # predictions on the training documents
print(y_train.values)          # true labels, for a quick visual comparison
apply
# DataFrame.apply demo: run a function over each column (doubles every value).
from sklearn import preprocessing
import pandas as pd

# The `categorical_features` parameter was removed from OneHotEncoder
# (deprecated 0.20, gone in 0.22) — select columns with ColumnTransformer
# instead. A plain encoder keeps this snippet runnable; `enc` is reused by
# the model-persistence snippet above.
enc = preprocessing.OneHotEncoder()

a = pd.DataFrame([[1, "A", "a"],
                  [0, "B", "b"],
                  [2, "C", "c"]], columns=["ebayno", "p_sku", "sale"])

def f(x):
    """Double every value of the column, preserving its index."""
    i = x.index
    v = x.values * 2  # numbers double; strings repeat ("A" -> "AA")
    print(v)
    return pd.Series(v, i)

a.apply(f)
tfidf
# TF-IDF without L2 normalisation on a two-document corpus.
corpus = ["hi peter",
          "hi tom"]

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf2 = TfidfVectorizer(norm=None)  # keep raw tf*idf weights
# Renamed from `re`, which shadowed the stdlib regex module.
weights = tfidf2.fit_transform(corpus)
print(tfidf2.vocabulary_)  # token -> column index
# get_feature_names() was removed in scikit-learn 1.2.
print(tfidf2.get_feature_names_out())
print(weights.todense())