--- title: Mixed data keywords: fastai sidebar: home_sidebar summary: "DataLoader that can take data from multiple dataloaders with different types of data" description: "DataLoader that can take data from multiple dataloaders with different types of data" nb_path: "nbs/022_data.mixed.ipynb" ---
from tsai.data.tabular import *
# Example 1: combine two tabular dataloaders built from the same dataframe.
# Locate the ADULT_SAMPLE dataset via `untar_data` and read its `adult.csv` file.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
# df['salary'] = np.random.rand(len(df)) # uncomment to simulate a cont dependent variable
target = 'salary'
# A single split over the row indices is shared by both dataloaders below.
splits = RandomSplitter()(range_of(df))
# First dataloaders: three categorical and two continuous columns, batch size 512.
cat_names = ['workclass', 'education', 'marital-status']
cont_names = ['age', 'fnlwgt']
dls1 = get_tabular_dls(df, cat_names=cat_names, cont_names=cont_names, y_names=target, splits=splits, bs=512)
dls1.show_batch()
# Second dataloaders: no categorical columns and a single continuous one, batch size 128.
cat_names = None #['occupation', 'relationship', 'race']
cont_names = ['education-num']
dls2 = get_tabular_dls(df, cat_names=cat_names, cont_names=cont_names, y_names=target, splits=splits, bs=128)
dls2.show_batch()
# Combine both sources; `bs=8` is passed for the mixed dataloaders even though
# `dls1` and `dls2` were created with different batch sizes.
dls = get_mixed_dls(dls1, dls2, bs=8)
# Pull the first batch from the train and valid loaders (displayed as cell output in the notebook).
first(dls.train)
first(dls.valid)
# Round-trip the mixed dataloaders through disk to check that they can be
# saved and restored.
import os

# `torch.save` does not create missing parent directories, so make sure
# `export/` exists before writing into it.
os.makedirs('export', exist_ok=True)
torch.save(dls,'export/mixed_dls.pth')
del dls
# NOTE(review): from PyTorch 2.6 on, `torch.load` defaults to
# `weights_only=True`, which rejects arbitrary pickled objects such as this
# one. On those versions pass `weights_only=False` (the file is written just
# above, so it is trusted input).
dls = torch.load('export/mixed_dls.pth')
dls.train.show_batch()

# A mixed batch is an (inputs, targets) pair; show the inputs.
xb, yb = first(dls.train)
xb

# The indexing below reads two parts from each of the two inputs
# (presumably the categorical and continuous tensors of each tabular source).
xs, ys = first(dls.train)
xs[0][0].shape, xs[0][1].shape, xs[1][0].shape, xs[1][1].shape
from tsai.data.validation import TimeSplitter
from tsai.data.core import TSRegression, get_ts_dls
# Example 2, part 1: a toy time-series dataset.
# 8 distinct samples of shape (2, 5); every value of sample i is i. The set is
# then stacked twice along the first axis, giving 16 samples.
X = np.tile(np.arange(8)[:, None, None], (1, 2, 5)).astype(float)
X = np.tile(X, (2, 1, 1))

# Targets repeat 0..7 twice, so each target equals the value stored in its sample.
y = np.tile(np.arange(len(X) // 2), 2)

# Only needed when switching to the categorical target below.
alphabet = np.array([*string.ascii_lowercase])
# y = alphabet[y]

splits = TimeSplitter(.5, show_plot=False)(range_of(X))
tfms = [None, TSRegression()]
dls1 = get_ts_dls(X, y, splits=splits, tfms=tfms)
dls1.one_batch()
# Example 2, part 2: a tabular dataset aligned row by row with the time series.
# Row i holds (i, 10*i, 100*i) for i in 0..7, and the 8 rows are stacked twice.
data = np.tile(np.arange(8)[:, None] * np.array([1, 10, 100]), (2, 1))
df = pd.DataFrame(data, columns=['cat1', 'cat2', 'cont']).astype({'cont': float})
df['target'] = y

target = 'target'
cat_names = ['cat1', 'cat2']
cont_names = ['cont']

# Normalize stays disabled in `procs`; the checks further down compare the raw
# continuous values against the time series.
dls2 = get_tabular_dls(
    df,
    procs=[Categorify, FillMissing],
    cat_names=cat_names,
    cont_names=cont_names,
    y_names=target,
    splits=splits,
    bs=8,
)
dls2.one_batch()
# Peek at the first batch produced by the underlying loader of `dls1.train`.
# `_loaders` is indexed with the boolean `num_workers == 0` (False -> 0,
# True -> 1) to choose which loader-iterator class to instantiate.
# NOTE(review): `_loaders` is a private name that must already be in scope;
# it is not defined in this section.
z = zip(_loaders[dls1.train.fake_l.num_workers == 0](dls1.train.fake_l))
# `zip` over a single iterable yields 1-tuples, so `b` is `(batch,)`.
for b in z:
    print(b)
    break  # only the first batch is needed
# Combine the time-series and tabular dataloaders and check that every batch
# keeps both sources aligned on the same samples.
bs = 8
dls = get_mixed_dls(dls1, dls2, bs=bs)

# --- training batch ---
dl = dls.train
xb, yb = dl.one_batch()
test_eq(len(xb), 2)          # one input per source dataloader
test_eq(len(xb[0]), bs)      # time-series input: one entry per sample
test_eq(len(xb[1]), 2)       # tabular input has two parts (presumably categorical, continuous)
test_eq(len(xb[1][0]), bs)
test_eq(len(xb[1][1]), bs)
# The categorical code is compared after subtracting 1 (presumably index 0 is
# reserved by Categorify -- confirm).
test_eq(xb[0].data[:, 0, 0].long(), xb[1][0][:, 0] - 1) # categorical data and ts are in sync
# `cont` was built as 100 * sample value, so dividing by 100 recovers it.
test_eq(xb[0].data[:, 0, 0], (xb[1][1]/100).flatten()) # continuous data and ts are in sync
# Look the targets up through `y`, as the validation check below does, so the
# assertion does not rely on `y[i] == i` holding for the training indices.
test_eq(tensor(y[dl.input_idxs]), yb.long().cpu())

# --- validation batch ---
dl = dls.valid
xb, yb = dl.one_batch()
test_eq(tensor(y[dl.input_idxs]), yb.long().cpu())