train_test_splits

학습 데이터와 테스트 데이터를 분리할 때는 사이킷런의 train_test_split 함수를 이용한다.

import numpy as np
from sklearn.model_selection import train_test_split

# Values 1..10 as a 1-D array, then reshaped into a 10-row, 1-column array.
col = np.arange(1, 11)
col = col.reshape(10, 1)
# Column 0: ten single-element rows alternating between 1 and 2.
data = [[label] for label in (1, 2) * 5]
# axis=1 appends col as a new column rather than as new rows.
data = np.append(data, col, axis=1)
data

array([[ 1,  1],
       [ 2,  2],
       [ 1,  3],
       [ 2,  4],
       [ 1,  5],
       [ 2,  6],
       [ 1,  7],
       [ 2,  8],
       [ 1,  9],
       [ 2, 10]])

col에 1부터 10까지의 값을 1차원 배열로 넣는다.
reshape로 행의 크기가 10, 열의 크기가 1인 배열로 변환한다.
행을 Row, 열을 Column이라 부른다.

data의 0번째 칼럼에 속성이 1이나 2인 값을 넣는다.
data에 col을 열로 추가한다.
append의 3번째 매개변수가 1이면 열(Column) 추가이다.

# Third column: every value of col shifted up by 10.
newcol = 10 + col
# axis=1 appends newcol to data as an extra column.
newdata = np.append(data, newcol, axis=1)
newdata

array([[ 1,  1, 11],
       [ 2,  2, 12],
       [ 1,  3, 13],
       [ 2,  4, 14],
       [ 1,  5, 15],
       [ 2,  6, 16],
       [ 1,  7, 17],
       [ 2,  8, 18],
       [ 1,  9, 19],
       [ 2, 10, 20]])

data와 newcol로 newdata를 만든다.

# Columns 0 and 1 go to x; column 2 goes to y.
x, y = newdata[:, :2], newdata[:, 2]
x

array([[ 1,  1],
       [ 2,  2],
       [ 1,  3],
       [ 2,  4],
       [ 1,  5],
       [ 2,  6],
       [ 1,  7],
       [ 2,  8],
       [ 1,  9],
       [ 2, 10]])

데이터를 x, y 배열에 나누어 담는다.

# Seed handed to train_test_split as random_state.
seed = 5
# Variant without stratify, kept for comparison:
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=seed)
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=seed,
    shuffle=True,
    # Stratify on column 0 of newdata (the 1/2 values).
    stratify=newdata[:, 0:1],
)
# Report how many rows ended up in each split.
print('x_train Àüü µ¥ÀÌÅÍ %d: ' % len(x_train))
print('x_test Àüü µ¥ÀÌÅÍ %d:' % len(x_test))
x_test

x_train Àüü µ¥ÀÌÅÍ 8:
x_test Àüü µ¥ÀÌÅÍ 2:
array([[2, 2],
       [1, 7]])

사이킷런의 train_test_split 함수로 학습 데이터와 테스트 데이터를 분리할 수 있다.

test_size=0.2
전체를 1로 보았을 때 테스트 데이터의 크기를 0.2의 비율로 분리한다. 여기서는 전체가 10개, 테스트 데이터는 2개이다.

random_state=seed
랜덤 시드의 값을 설정한다.

stratify=newdata[:, 0:1]
0번째 칼럼의 1, 2 값이 골고루 포함되도록 한다.
stratify 값을 설정하지 않으면 다음과 같이 x_test 값이 분리될 수 있다.
여러 값이 골고루 포함되었을 때 과적합을 막을 수 있다.
array([[2, 2],
       [2, 8]])

shuffle=True
shuffle을 하는 이유는 미니 배치로 기울기의 평균을 구해 학습할 때, 데이터가 섞여 있지 않으면 잘못된 방향으로 학습할 수 있기 때문이다.

전체 코드는 다음과 같다.
import numpy as np
from sklearn.model_selection import train_test_split

# Values 1..10 as a 1-D array, then reshaped into a 10-row, 1-column array.
col = np.arange(1, 11)
col = col.reshape(10, 1)
# Column 0: ten single-element rows alternating between 1 and 2.
data = [[label] for label in (1, 2) * 5]
# axis=1 appends col as a new column rather than as new rows.
data = np.append(data, col, axis=1)

# Third column: every value of col shifted up by 10.
newcol = 10 + col
newdata = np.append(data, newcol, axis=1)

# Columns 0 and 1 go to x; column 2 goes to y.
x, y = newdata[:, :2], newdata[:, 2]

# Seed handed to train_test_split as random_state.
seed = 5
# Variant without stratify, kept for comparison:
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=seed)
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=seed,
    shuffle=True,
    # Stratify on column 0 of newdata (the 1/2 values).
    stratify=newdata[:, 0:1],
)
# Report how many rows ended up in each split.
print('x_train Àüü µ¥ÀÌÅÍ %d: ' % len(x_train))
print('x_test Àüü µ¥ÀÌÅÍ %d:' % len(x_test))