model
DataLoader
¶
Load data (train, test)
Parameters:
Name | Type | Description | Default |
---|---|---|---|
config | | config file path | required |
Usage:
from techlandscape.model import DataLoader
from techlandscape.utils import get_config
cfg = get_config("configs/model_cnn/default.yaml")
cfg.update({"data": {"train": "your-train.jsonl", "test": "your-test.jsonl"}, "out": "your-save-dir"})
data_loader = DataLoader(cfg)
data_loader.load()
# check examples
data_loader.text_train
load(self)
¶
Load data. Expects a jsonl file where each row has at least two fields: 'text' and 'is_seed'.
Source code in techlandscape/model.py
def load(self):
    """Load train and test data from `self.train_path` and `self.test_path`.

    Expects jsonl files where each row has at least two fields: 'text' and
    'is_seed'. Data are read only when at least one of `text_train`,
    `text_test`, `y_train` or `y_test` is still None; otherwise the data
    already held are kept and a notice is printed.

    If reading 'is_seed' from the test file raises KeyError, `y_test` is
    left as it was and a warning is printed instead of failing.
    """
    current = (self.text_train, self.text_test, self.y_train, self.y_test)
    if any(item is None for item in current):
        self.text_train = self._get_data(self.train_path, "text")
        self.text_test = self._get_data(self.test_path, "text")
        self.y_train = np.array(self._get_data(self.train_path, "is_seed")).astype(
            int
        )
        try:
            self.y_test = np.array(
                self._get_data(self.test_path, "is_seed")
            ).astype(int)
        except KeyError:
            # Deliberate best-effort: a test set without labels is still usable.
            typer.secho(
                "No output variable in test. You can still vectorize the data.",
                fg=typer.colors.YELLOW,
            )
        # `fg` sets the text colour; `color` is only a boolean ANSI on/off switch.
        typer.secho(f"{ok}Data loaded", fg=typer.colors.GREEN)
    else:
        typer.secho("Data already populated", fg=typer.colors.YELLOW)
    typer.secho(
        f"{ok}{len(self.text_train)} examples loaded in training set",
        fg=typer.colors.BLUE,
    )
    typer.secho(
        f"{ok}{len(self.text_test)} examples loaded in test set",
        fg=typer.colors.BLUE,
    )
Model
¶
Main model class (data + model architecture + training)
Parameters:
Name | Type | Description | Default |
---|---|---|---|
config | | config | required |
filepath | | saving model directory | required |
Usage:
from techlandscape.model import Model
from techlandscape.utils import get_config
cfg = get_config("configs/model_cnn/default.yaml")
cfg.update({"data": {"train": "your-train.jsonl", "test": "your-test.jsonl"}, "out": "your-save-dir"})
model = Model(cfg)
model.fit()
model.save()
ModelBuilder
¶
Build model
Parameters:
Name | Type | Description | Default |
---|---|---|---|
config | | config file path | required |
Usage:
from techlandscape.model import ModelBuilder
from techlandscape.utils import get_config
cfg = get_config("configs/model_cnn/default.yaml")
cfg.update({"data": {"train": "your-train.jsonl", "test": "your-test.jsonl"}, "out": "your-save-dir"})
model_builder = ModelBuilder(cfg)
model_builder.build()
# check model
model_builder.model.summary()
build(self, embedding_matrix=None)
¶
Instantiate model based on config file
Source code in techlandscape/model.py
def build(self, embedding_matrix: dict = None):
    """
    Instantiate model based on config file

    The architecture is read from `self.cfg["model"]["architecture"]`.
    Nothing is built when `self.model` is already set (truthy); a notice is
    printed instead.

    Parameters:
        embedding_matrix: passed through to `_build_cnn` when the
            architecture is cnn; ignored for mlp.

    Raises:
        AssertionError: the architecture is not a member name of
            `SupportedModels`.
        UnknownModel: the architecture matches neither the mlp nor the cnn
            value of `SupportedModels`.
    """
    architecture = self.cfg["model"]["architecture"]
    assert architecture in SupportedModels._member_names_
    if not self.model:
        if architecture == SupportedModels.mlp.value:
            self._build_mlp()
        elif architecture == SupportedModels.cnn.value:
            # TODO handle embedding matrix properly
            self._build_cnn(embedding_matrix)
        else:
            raise UnknownModel(UNKNOWN_MODEL_MSG)
        # `fg` sets the text colour; `color` is only a boolean ANSI on/off switch.
        typer.secho(
            f"{ok}Model built (see self.model.summary() for details)",
            fg=typer.colors.GREEN,
        )
    else:
        typer.secho("Model already built", fg=typer.colors.YELLOW)
ModelCompiler
¶
Compile model
Parameters:
Name | Type | Description | Default |
---|---|---|---|
config | | config file path | required |
Usage:
from techlandscape.model import ModelCompiler
from techlandscape.utils import get_config
cfg = get_config("configs/model_cnn/default.yaml")
cfg.update({"data": {"train": "your-train.jsonl", "test": "your-test.jsonl"}, "out": "your-save-dir"})
model_compiler = ModelCompiler(cfg)
model_compiler.compile()
# check model, e.g. loss
model_compiler.model.loss
compile(self, embedding_matrix=None)
¶
Compile model. Use config file to instantiate training components.
Source code in techlandscape/model.py
def compile(self, embedding_matrix: dict = None):
    """Compile model. Use config file to instantiate training components.

    Calls `self.build(embedding_matrix)` first, then creates an Adam
    optimizer with the learning rate found at
    `self.cfg["model"]["optimizer"]["learning_rate"]` and compiles
    `self.model` with the configured loss and binary accuracy, precision
    and recall as metrics.

    Parameters:
        embedding_matrix: forwarded unchanged to `build`.
    """
    self.build(embedding_matrix)
    optimizer_cfg = self.cfg["model"]["optimizer"]
    # `learning_rate` is the supported keyword; the `lr` alias is deprecated
    # and is dropped or rejected by newer Keras optimizers.
    self.optimizer = Adam(learning_rate=float(optimizer_cfg["learning_rate"]))
    self.model.compile(
        optimizer=self.optimizer,
        loss=optimizer_cfg["loss"],
        metrics=[
            tf.keras.metrics.BinaryAccuracy(),
            tf.keras.metrics.Precision(),
            tf.keras.metrics.Recall(),
        ],
    )
    # `fg` sets the text colour; `color` is only a boolean ANSI on/off switch.
    typer.secho(
        f"{ok}Model compiled (see self.model.summary() for details)",
        fg=typer.colors.GREEN,
    )
ModelFitter
¶
Fit model
Parameters:
Name | Type | Description | Default |
---|---|---|---|
config | | config file path | required |
Usage:
from techlandscape.model import ModelFitter
from techlandscape.utils import get_config
cfg = get_config("configs/model_cnn/default.yaml")
cfg.update({"data": {"train": "your-train.jsonl", "test": "your-test.jsonl"}, "out": "your-save-dir"})
model_fitter = ModelFitter(cfg)
model_fitter.fit()
# check model, e.g. history
model_fitter.model.history
fit(self)
¶
Fit model
Source code in techlandscape/model.py
def fit(self):
    """Fit model

    Compiles the model, appends the callbacks enabled in the config
    (early stopping, best-model checkpoint, TensorBoard) to
    `self.callbacks`, then trains on `x_train`/`y_train` with
    `x_test`/`y_test` as validation data. Training is skipped when
    `self.model.history` is already set (truthy).

    Side effects: sets `self.filepath_best` and `self.logdir` when the
    corresponding callbacks are active.
    """
    self.compile()

    callbacks_cfg = self.cfg["training"]["callbacks"]
    early_stopping_cfg = callbacks_cfg["early_stopping"]
    if early_stopping_cfg["active"]:
        self.callbacks += [
            EarlyStopping(
                monitor=early_stopping_cfg["monitor"],
                patience=early_stopping_cfg["patience"],
                restore_best_weights=True,
            )
        ]

    save_best_cfg = callbacks_cfg["save_best_only"]
    if save_best_cfg["active"]:
        self.filepath_best = (
            get_project_root() / Path(self.cfg["out"]) / Path("model-best")
        )
        self.callbacks += [
            tf.keras.callbacks.ModelCheckpoint(
                filepath=self.filepath_best,
                monitor=save_best_cfg["monitor"],
                save_best_only=True,
                verbose=save_best_cfg["verbose"],
            )
        ]

    tensorboard_cfg = self.cfg["logger"]["tensorboard"]
    if tensorboard_cfg["active"]:
        self.logdir = get_project_root() / Path(tensorboard_cfg["logdir"])
        self.callbacks += [tf.keras.callbacks.TensorBoard(self.logdir)]

    if not self.model.history:
        # if self.model_architecture == "mlp":
        #    self.x_train = tf.sparse.reorder(self._convert_sparse_matrix_to_sparse_tensor(self.x_train))
        #    self.x_test = tf.sparse.reorder(self._convert_sparse_matrix_to_sparse_tensor(self.x_test))
        self.model.fit(
            self.x_train,
            self.y_train,
            epochs=self.cfg["training"]["epochs"],
            callbacks=self.callbacks,
            validation_data=(self.x_test, self.y_test),
            verbose=self.cfg["logger"]["verbose"],
            batch_size=self.cfg["training"]["batch_size"],
        )
        # `fg` sets the text colour; `color` is only a boolean ANSI on/off switch.
        typer.secho(f"{ok}Model trained", fg=typer.colors.GREEN)
    else:
        # Alternative: clear session (keras.backend.clear_session()) and retrain
        typer.secho("Model already trained", fg=typer.colors.YELLOW)
TextVectorizer
¶
Vectorize data
Parameters:
Name | Type | Description | Default |
---|---|---|---|
config | | config file path | required |
Usage:
from techlandscape.model import TextVectorizer
from techlandscape.utils import get_config
cfg = get_config("configs/model_cnn/default.yaml")
cfg.update({"data": {"train": "your-train.jsonl", "test": "your-test.jsonl"}, "out": "your-save-dir"})
text_loader = TextVectorizer(cfg)
text_loader.vectorize()
# check examples
text_loader.x_train
vectorize(self)
¶
Vectorize texts (train and test). Nothing is returned; see the usage example above for reading the result (e.g. `x_train`).
Source code in techlandscape/model.py
def vectorize(self):
    """Vectorize texts (train and test).

    Dispatches on `self.model_architecture`: "cnn" uses `_get_sequences`,
    "mlp" uses `_get_ngrams`. Nothing is returned; the helpers presumably
    store the result on the instance (e.g. `x_train`) - confirm against
    their definitions.

    Raises:
        UnknownModel: `self.model_architecture` is neither "cnn" nor "mlp".
    """
    if self.model_architecture == "cnn":
        self._get_sequences()
    elif self.model_architecture == "mlp":
        self._get_ngrams()
    else:
        raise UnknownModel(UNKNOWN_MODEL_MSG)
    # `fg` sets the text colour; `color` is only a boolean ANSI on/off switch.
    typer.secho(f"{ok}Text vectorized", fg=typer.colors.GREEN)
train(cfg)
¶
Train the model, then save its metadata and config
Source code in techlandscape/model.py
@hydra.main(config_path="../configs")
def train(cfg: DictConfig) -> None:
    """
    Train a model from the hydra config, then save its metadata and config.

    Builds a `Model` from `cfg`, fits it, and calls `save_meta()` followed
    by `save_config()`.
    """
    trainer = Model(config=cfg)
    trainer.fit()
    trainer.save_meta()
    trainer.save_config()