Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/dryml/examples/sentiment_analysis/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from dryml.examples.sentiment_analysis.workshop import SentimentWorkshop
from dryml.examples.sentiment_analysis.torch.models import SentimentTorchModel

# Public API of the sentiment-analysis example package: the data/training
# workshop and the pre-configured torch model wrapper.
__all__ = [
    "SentimentWorkshop",
    "SentimentTorchModel"
]
26 changes: 26 additions & 0 deletions src/dryml/examples/sentiment_analysis/torch/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from dryml.models.torch.generic import ModelWrapper

class SentimentModelTorch(nn.Module):
    """Convolution-over-embeddings binary sentiment classifier.

    Maps integer token-id sequences of shape (batch, seq_len) to a
    per-example probability in [0, 1] via an embedding layer, a single
    width-5 convolution over the embedded sequence, max-over-time
    pooling, and a small MLP head with a sigmoid output.
    """

    def __init__(self, vocab_size=10000, max_length=250, embedding_dim=32):
        super().__init__()
        # NOTE(review): max_length is accepted for interface compatibility
        # (ModelWrapper passes it) but is never used here — the conv +
        # max-pool handles variable sequence lengths. Confirm it can stay
        # unused.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        # Kernel spans 5 tokens across the full embedding width, so the
        # conv output has width 1 and acts like a 1-D conv over time.
        self.conv = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=(5, embedding_dim))
        self.fc1 = nn.Linear(32, 32)
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(32, 1)

    def forward(self, x):
        """Return sigmoid probabilities of shape (batch,) for token ids x."""
        embedded = self.embedding(x).unsqueeze(1)   # (B, 1, L, E): add channel dim
        features = F.relu(self.conv(embedded))      # (B, 32, L-4, 1)
        pooled = torch.max(features, dim=2).values  # max over time -> (B, 32, 1)
        flat = pooled.view(pooled.size(0), -1)      # (B, 32)
        hidden = self.dropout(F.relu(self.fc1(flat)))
        return torch.sigmoid(self.fc2(hidden)).squeeze(1)

# Pre-configured dryml wrapper exposing the network with its default
# hyperparameters; imported by train.py and the package __init__.
SentimentTorchModel = ModelWrapper(SentimentModelTorch, vocab_size=10000, max_length=250, embedding_dim=32)
45 changes: 45 additions & 0 deletions src/dryml/examples/sentiment_analysis/train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import dryml
from dryml import Repo

# Configure the dryml compute context before any model classes are used:
# one torch GPU ('gpu/0') plus empty default/tf contexts. The model-side
# imports are deferred into the __main__ guard so they run after this call.
dryml.context.set_context({
    'default': {},
    'torch': {'gpu/0': 1.},
    'tf': {}
})

from dryml.models import Pipe

from dryml.examples.sentiment_analysis.workshop import train as train_model

if __name__ == "__main__":
    # Deferred imports: executed only after set_context() above.
    from dryml.models.torch.text import TextVectorizer
    from dryml.examples.sentiment_analysis.torch.models import SentimentTorchModel
    from dryml.models.torch.generic import BasicTraining, TorchOptimizer, Trainable as TorchTrainable
    from dryml.models.torch.generic import Wrapper
    import torch.nn as nn
    import torch

    # Text -> fixed-length (250) token-id vectorizer over a 10k vocab;
    # dry_id pins a stable identity for the saved object.
    torch_vectorizer = TextVectorizer(max_tokens=10000, sequence_length=250, dry_id="imdb_vectorizer")

    torch_optimizer = TorchOptimizer(torch.optim.Adam, SentimentTorchModel, lr=0.001)
    loss_fn = Wrapper(nn.BCELoss)  # BCE matches the model's sigmoid output

    sentiment_torch_trainable = TorchTrainable(
        model=SentimentTorchModel,
        train_fn=BasicTraining(epochs=10, optimizer=torch_optimizer, loss=loss_fn)
    )

    from dryml.data.torch.transforms import TorchDevice
    from dryml.context import context

    # Use the first torch device granted by the context configured above.
    dev = context().get_torch_devices()[0]

    # Pipeline: vectorize text -> move batches to device -> trainable model.
    torch_pipe = Pipe(torch_vectorizer, TorchDevice(device=dev), sentiment_torch_trainable)
    trained_torch_pipe = train_model(torch_pipe)

    repo = Repo(directory="sentiment_models", create=True)
    # NOTE(review): add_nested=False, yet the message below claims all
    # contained sub-objects are saved — confirm Repo.add_object semantics.
    repo.add_object(trained_torch_pipe, add_nested=False)
    repo.save()


    print("Torch model and all contained sub-objects saved under ./sentiment_models/")
46 changes: 46 additions & 0 deletions src/dryml/examples/sentiment_analysis/workshop.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import dryml
import tensorflow_datasets as tfds
import torch
from torch.utils.data import Dataset
from dryml.data.torch import TorchDataset
from dryml.models.torch.text import TextVectorizer
from dryml.models import Pipe
from dryml.metrics.scalar import binary_accuracy

class IMDBTorchDataset(Dataset):
    """Torch Dataset over a list of (bytes_text, label) pairs.

    Each item is returned as (utf-8 decoded str, float32 scalar tensor),
    the form expected by BCE-style training.
    """

    def __init__(self, data):
        self.data = data  # list of (bytes, int-like label) pairs

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        raw_text, raw_label = self.data[idx]
        label = torch.tensor(float(raw_label), dtype=torch.float)
        return raw_text.decode('utf-8'), label

class SentimentWorkshop(dryml.Workshop):
    """Workshop that prepares the IMDB reviews dataset for training.

    After data_prep(), self.train_ds and self.test_ds hold supervised
    TorchDatasets of (decoded review text, float label) pairs.
    """

    def data_prep(self):
        """Download IMDB via tensorflow_datasets and build the splits.

        Materializes both splits into memory as (bytes_text, label)
        pairs, wraps them in IMDBTorchDataset, and exposes them as
        supervised TorchDatasets.
        """
        # Fix: the original requested with_info=True but never used the
        # returned info object; fetch only the splits.
        ds_train, ds_test = tfds.load(
            'imdb_reviews',
            split=['train', 'test'],
            as_supervised=True
        )
        train_data = list(ds_train.as_numpy_iterator())
        test_data = list(ds_test.as_numpy_iterator())

        train_torch_ds = IMDBTorchDataset(train_data)
        test_torch_ds = IMDBTorchDataset(test_data)
        self.train_ds = TorchDataset(train_torch_ds, supervised=True)
        self.test_ds = TorchDataset(test_torch_ds, supervised=True)

def train(trainable):
    """Prepare the IMDB data, train *trainable* on it, and return it."""
    workshop = SentimentWorkshop()
    workshop.data_prep()

    trainable.prep_train()

    trainable.train(workshop.train_ds)
    return trainable


77 changes: 77 additions & 0 deletions src/dryml/models/torch/text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
from dryml.models.torch.generic import Trainable
import torch
from collections import Counter
import re
import numpy as np

# Text Vectorizer for Sentiment Analysis Model
class TextVectorizer(Trainable):
    """Fit-once text-to-token-id vectorizer.

    Builds a frequency-capped vocabulary from a supervised dataset and
    converts raw strings into fixed-length int64 id sequences, padding
    short sequences with pad_token and mapping out-of-vocabulary words
    to unk_token.
    """

    def __init__(self, max_tokens=10000, sequence_length=250, pad_token="<PAD>", unk_token="<UNK>"):
        super().__init__()
        # NOTE(review): callers (train.py) also pass dry_id=...; that
        # keyword is presumably consumed by the dryml base machinery
        # before reaching this __init__ — confirm.
        self.max_tokens = max_tokens            # vocab cap, including PAD/UNK
        self.sequence_length = sequence_length  # fixed output length
        self.pad_token = pad_token
        self.unk_token = unk_token

        self.vocab = {}        # token -> id
        self.id_to_token = []  # id -> token
        self.trained = False   # has build_vocab run?

    def simple_tokenize(self, text):
        """Lowercase *text* and split it into word-character runs."""
        return re.findall(r"\w+", text.lower())

    def build_vocab(self, dataset):
        """Count tokens over (text, label) pairs; keep the most common.

        Ids 0 and 1 are reserved for pad_token and unk_token, so at most
        max_tokens - 2 real words enter the vocabulary.
        """
        counter = Counter()
        for x, _ in dataset:
            counter.update(self.simple_tokenize(x))
        most_common = counter.most_common(self.max_tokens - 2)
        self.id_to_token = [self.pad_token, self.unk_token] + [w for w, _ in most_common]
        self.vocab = {w: i for i, w in enumerate(self.id_to_token)}

    def vectorize_text(self, text):
        """Convert one string to a length-sequence_length int64 array."""
        # Hoist the UNK id: one dict lookup per call instead of per token.
        unk_id = self.vocab[self.unk_token]
        ids = [self.vocab.get(t, unk_id) for t in self.simple_tokenize(text)]
        # Truncate first, then right-pad to exactly sequence_length.
        ids = ids[:self.sequence_length]
        if len(ids) < self.sequence_length:
            ids += [self.vocab[self.pad_token]] * (self.sequence_length - len(ids))
        return np.array(ids, dtype=np.int64)

    def train(self, ds, **kwargs):
        """Build the vocabulary once; later calls are no-ops."""
        if not self.trained:
            self.build_vocab(ds)
            self.trained = True

    def predict(self, x, **kwargs):
        """Vectorize a str, an ndarray of strings, or a list/tuple of strings.

        Returns a 1-D LongTensor for a single string, a stacked 2-D
        LongTensor for a batch.

        Raises:
            ValueError: for any other input type.
        """
        if isinstance(x, str):
            return torch.tensor(self.vectorize_text(x), dtype=torch.long)
        elif isinstance(x, np.ndarray):
            # Delegate to the list branch after converting.
            return self.predict(x.tolist(), **kwargs)
        elif isinstance(x, (list, tuple)):
            return torch.stack([
                torch.tensor(self.vectorize_text(item), dtype=torch.long)
                for item in x
            ])
        raise ValueError("Unsupported input type for vectorizer.")


    def eval(self, dataset, **kwargs):
        """Map a supervised dataset's text features to id tensors, keeping labels."""
        return dataset.map(lambda x: (self.predict(x[0]), x[1]))

    def prep_train(self, **kwargs):
        # Nothing to prepare; the vocabulary is built lazily in train().
        pass

    def save_model(self, *args, **kwargs):
        """Return serializable state: vocab, id list, and trained flag."""
        return {
            'vocab': self.vocab,
            'id_to_token': self.id_to_token,
            'trained': self.trained
        }

    def load_model(self, dct, *args, **kwargs):
        """Restore state produced by save_model()."""
        self.vocab = dct['vocab']
        self.id_to_token = dct['id_to_token']
        self.trained = dct['trained']