model

`DataLoader` ¶

Load data (train, test)

Parameters:

Name	Type	Description	Default
`config`		config object (e.g. loaded with `get_config`)	required

Usage:

from techlandscape.model import DataLoader
from techlandscape.utils import get_config

cfg = get_config("configs/model_cnn/default.yaml")
cfg.update({"data": {"train": "your-train.jsonl", "test": "your-test.jsonl"}, "out": "your-save-dir"})

data_loader = DataLoader(cfg)
data_loader.load()

# check examples
data_loader.text_train

`load(self)` ¶

Load data. Expects a jsonl file where each row has at least two fields: 'text' and 'is_seed'.

Source code in techlandscape/model.py

def load(self):
    """Load the train and test data unless they are already populated.

    Expects jsonl files where each row has at least two fields: 'text' and
    'is_seed'. Populates ``text_train``, ``text_test``, ``y_train`` and
    ``y_test`` from ``train_path`` and ``test_path``.

    If reading 'is_seed' from the test file raises ``KeyError``, a warning is
    printed and ``y_test`` is left untouched.
    """
    populated = (self.text_train, self.text_test, self.y_train, self.y_test)
    if any(item is None for item in populated):
        self.text_train = self._get_data(self.train_path, "text")
        self.text_test = self._get_data(self.test_path, "text")
        self.y_train = np.array(self._get_data(self.train_path, "is_seed")).astype(
            int
        )
        try:
            self.y_test = np.array(
                self._get_data(self.test_path, "is_seed")
            ).astype(int)
        except KeyError:
            # NOTE: y_test keeps its previous value here; if that value is
            # None, the next call to load() will read the files again.
            typer.secho(
                "No output variable in test. You can still vectorize the data.",
                fg=typer.colors.YELLOW,
            )
        # `fg` is the style keyword; `color` is click's on/off switch for ANSI
        # output and does not set a colour.
        typer.secho(f"{ok}Data loaded", fg=typer.colors.GREEN)
    else:
        typer.secho("Data already populated", fg=typer.colors.YELLOW)
    typer.secho(
        f"{ok}{len(self.text_train)} examples loaded in training set",
        fg=typer.colors.BLUE,
    )
    typer.secho(
        f"{ok}{len(self.text_test)} examples loaded in test set",
        fg=typer.colors.BLUE,
    )

`Model` ¶

Main model class (data + model architecture + training)

Parameters:

Name	Type	Description	Default
`config`		config	required
`filepath`		saving model directory	required

Usage:

from techlandscape.model import Model
from techlandscape.utils import get_config

cfg = get_config("configs/model_cnn/default.yaml")
cfg.update({"data": {"train": "your-train.jsonl", "test": "your-test.jsonl"}, "out": "your-save-dir"})

model = Model(cfg)
model.fit()
model.save()

`ModelBuilder` ¶

Build model

Parameters:

Name	Type	Description	Default
`config`		config object (e.g. loaded with `get_config`)	required

Usage:

from techlandscape.model import ModelBuilder
from techlandscape.utils import get_config

cfg = get_config("configs/model_cnn/default.yaml")
cfg.update({"data": {"train": "your-train.jsonl", "test": "your-test.jsonl"}, "out": "your-save-dir"})

model_builder = ModelBuilder(cfg)
model_builder.build()

# check model
model_builder.model.summary()

Resources

Google ML Guide on text-classification, Keras text-classification, Understanding CNN

`build(self, embedding_matrix=None)` ¶

Instantiate model based on config file

Source code in techlandscape/model.py

def build(self, embedding_matrix: dict = None):
    """
    Instantiate model based on config file

    The architecture is read from ``cfg["model"]["architecture"]`` and must be
    one of the ``SupportedModels`` member names. Nothing is rebuilt when
    ``self.model`` is already set.

    Parameters:
        embedding_matrix: forwarded to ``_build_cnn`` for the cnn
            architecture; not used for mlp.

    Raises:
        AssertionError: the configured architecture is not a supported one.
        UnknownModel: the architecture matches neither the mlp nor the cnn
            value.
    """
    architecture = self.cfg["model"]["architecture"]
    assert architecture in SupportedModels._member_names_
    if not self.model:
        if architecture == SupportedModels.mlp.value:
            self._build_mlp()
        elif architecture == SupportedModels.cnn.value:
            # TODO handle embedding matrix properly
            self._build_cnn(embedding_matrix)
        else:
            raise UnknownModel(UNKNOWN_MODEL_MSG)
        # `fg` is the style keyword; `color` is click's on/off switch for ANSI
        # output and does not set a colour.
        typer.secho(
            f"{ok}Model built (see self.model.summary() for details)",
            fg=typer.colors.GREEN,
        )
    else:
        typer.secho("Model already built", fg=typer.colors.YELLOW)

`ModelCompiler` ¶

Compile model

Parameters:

Name	Type	Description	Default
`config`		config object (e.g. loaded with `get_config`)	required

Usage:

from techlandscape.model import ModelCompiler
from techlandscape.utils import get_config

cfg = get_config("configs/model_cnn/default.yaml")
cfg.update({"data": {"train": "your-train.jsonl", "test": "your-test.jsonl"}, "out": "your-save-dir"})

model_compiler = ModelCompiler(cfg)
model_compiler.compile()

# check model, e.g. loss
model_compiler.model.loss

`compile(self, embedding_matrix=None)` ¶

Compile model. Use config file to instantiate training components.

Source code in techlandscape/model.py

def compile(self, embedding_matrix: dict = None):
    """Compile model. Use config file to instantiate training components.

    Builds the model first (via ``build``), then creates an Adam optimizer
    from ``cfg["model"]["optimizer"]["learning_rate"]`` and compiles the model
    with the configured loss and binary accuracy, precision and recall
    metrics.

    Parameters:
        embedding_matrix: forwarded unchanged to ``build``.
    """
    self.build(embedding_matrix)
    optimizer_cfg = self.cfg["model"]["optimizer"]
    # `learning_rate` is the current keyword; `lr` is a deprecated alias.
    self.optimizer = Adam(learning_rate=float(optimizer_cfg["learning_rate"]))
    self.model.compile(
        optimizer=self.optimizer,
        loss=optimizer_cfg["loss"],
        metrics=[
            tf.keras.metrics.BinaryAccuracy(),
            tf.keras.metrics.Precision(),
            tf.keras.metrics.Recall(),
        ],
    )
    # `fg` is the style keyword; `color` is click's on/off switch for ANSI
    # output and does not set a colour.
    typer.secho(
        f"{ok}Model compiled (see self.model.summary() for details)",
        fg=typer.colors.GREEN,
    )

`ModelFitter` ¶

Fit model

Parameters:

Name	Type	Description	Default
`config`		config object (e.g. loaded with `get_config`)	required

Usage:

from techlandscape.model import ModelFitter
from techlandscape.utils import get_config

cfg = get_config("configs/model_cnn/default.yaml")
cfg.update({"data": {"train": "your-train.jsonl", "test": "your-test.jsonl"}, "out": "your-save-dir"})

model_fitter = ModelFitter(cfg)
model_fitter.fit()

# check model, e.g. history
model_fitter.model.history

`fit(self)` ¶

Fit model

Source code in techlandscape/model.py

def fit(self):
    """Fit model

    Compiles the model, appends the callbacks enabled in the config (early
    stopping, best-model checkpoint, TensorBoard) to ``self.callbacks`` and
    trains on ``x_train``/``y_train`` with ``x_test``/``y_test`` as validation
    data. Training is skipped when ``self.model.history`` is already set.

    Side effects: sets ``self.filepath_best`` when the checkpoint callback is
    active and ``self.logdir`` when TensorBoard logging is active.
    """
    self.compile()

    callbacks_cfg = self.cfg["training"]["callbacks"]

    early_stopping_cfg = callbacks_cfg["early_stopping"]
    if early_stopping_cfg["active"]:
        self.callbacks += [
            EarlyStopping(
                monitor=early_stopping_cfg["monitor"],
                patience=early_stopping_cfg["patience"],
                restore_best_weights=True,
            )
        ]

    save_best_cfg = callbacks_cfg["save_best_only"]
    if save_best_cfg["active"]:
        self.filepath_best = (
            get_project_root() / Path(self.cfg["out"]) / Path("model-best")
        )
        self.callbacks += [
            tf.keras.callbacks.ModelCheckpoint(
                # Passed as str: not every Keras version accepts Path objects.
                filepath=str(self.filepath_best),
                monitor=save_best_cfg["monitor"],
                save_best_only=True,
                verbose=save_best_cfg["verbose"],
            )
        ]

    tensorboard_cfg = self.cfg["logger"]["tensorboard"]
    if tensorboard_cfg["active"]:
        self.logdir = get_project_root() / Path(tensorboard_cfg["logdir"])
        self.callbacks += [tf.keras.callbacks.TensorBoard(str(self.logdir))]

    if not self.model.history:
        # if self.model_architecture == "mlp":
        #     self.x_train = tf.sparse.reorder(self._convert_sparse_matrix_to_sparse_tensor(self.x_train))
        #     self.x_test = tf.sparse.reorder(self._convert_sparse_matrix_to_sparse_tensor(self.x_test))

        self.model.fit(
            self.x_train,
            self.y_train,
            epochs=self.cfg["training"]["epochs"],
            callbacks=self.callbacks,
            validation_data=(self.x_test, self.y_test),
            verbose=self.cfg["logger"]["verbose"],
            batch_size=self.cfg["training"]["batch_size"],
        )
        # `fg` is the style keyword; `color` is click's on/off switch for ANSI
        # output and does not set a colour.
        typer.secho(f"{ok}Model trained", fg=typer.colors.GREEN)
    else:
        # Alternative: clear session (keras.backend.clear_session()) and retrain
        typer.secho("Model already trained", fg=typer.colors.YELLOW)

`TextVectorizer` ¶

Vectorize data

Parameters:

Name	Type	Description	Default
`config`		config object (e.g. loaded with `get_config`)	required

Usage:

from techlandscape.model import TextVectorizer
from techlandscape.utils import get_config

cfg = get_config("configs/model_cnn/default.yaml")
cfg.update({"data": {"train": "your-train.jsonl", "test": "your-test.jsonl"}, "out": "your-save-dir"})

text_loader = TextVectorizer(cfg)
text_loader.vectorize()

# check examples
text_loader.x_train

`vectorize(self)` ¶

Vectorize texts (train and test). Nothing is returned; inspect the result on the instance (e.g. `x_train`).

Source code in techlandscape/model.py

def vectorize(self):
    """Vectorize texts (train and test)

    Delegates to ``_get_sequences`` for the "cnn" architecture and to
    ``_get_ngrams`` for "mlp". Nothing is returned by this method.

    Raises:
        UnknownModel: ``self.model_architecture`` is neither "cnn" nor "mlp".
    """
    if self.model_architecture == "cnn":
        self._get_sequences()
    elif self.model_architecture == "mlp":
        self._get_ngrams()
    else:
        raise UnknownModel(UNKNOWN_MODEL_MSG)
    # `fg` is the style keyword; `color` is click's on/off switch for ANSI
    # output and does not set a colour.
    typer.secho(f"{ok}Text vectorized", fg=typer.colors.GREEN)

`train(cfg)` ¶

Train and save model

Source code in techlandscape/model.py

@hydra.main(config_path="../configs")
def train(cfg: DictConfig) -> None:
    """
    Train and save model

    Builds a ``Model`` from the hydra config, fits it, then calls
    ``save_meta()`` and ``save_config()`` on it.
    """
    trainer = Model(config=cfg)
    trainer.fit()
    # Persist the run artefacts once training has finished.
    trainer.save_meta()
    trainer.save_config()

model

DataLoader ¶

load(self) ¶

Model ¶

ModelBuilder ¶

build(self, embedding_matrix=None) ¶

ModelCompiler ¶

compile(self, embedding_matrix=None) ¶

ModelFitter ¶

fit(self) ¶

TextVectorizer ¶

vectorize(self) ¶

train(cfg) ¶

`DataLoader` ¶

`load(self)` ¶

`Model` ¶

`ModelBuilder` ¶

`build(self, embedding_matrix=None)` ¶

`ModelCompiler` ¶

`compile(self, embedding_matrix=None)` ¶

`ModelFitter` ¶

`fit(self)` ¶

`TextVectorizer` ¶

`vectorize(self)` ¶

`train(cfg)` ¶