Skip to content

io

get_expansion(primary_key, table_ref, destination_table, credentials, sample_size=None, verbose=False)

Return (a sample of) the expansion table

Parameters:

Name Type Description Default
primary_key PrimaryKey

table primary key

required
table_ref str

expansion table

required
destination_table str

destination table

required
credentials Path

credentials file path

required
sample_size int

size of the sample (if None, then we extract all)

None
verbose bool

verbosity

False

Usage:

techlandscape io get-expansion family_id <table-ref> <destination-table> credentials_bq.json --sample-size 10000

Source code in techlandscape/io.py
@monitor
@app.command()
def get_expansion(
    primary_key: PrimaryKey,
    table_ref: str,
    destination_table: str,
    credentials: Path,
    sample_size: int = None,
    verbose: bool = False,
) -> None:
    """
    Return (a sample of) the expansion table

    Arguments:
        primary_key: table primary key
        table_ref: expansion table
        destination_table: destination table
        credentials: credentials file path
        sample_size: size of the sample (if None, then we extract all)
        verbose: verbosity

    **Usage:**
        ```
        techlandscape io get-expansion family_id <table-ref> <destination-table> credentials_bq.json --sample-size 10000
        ```
    """
    project_id = get_project_id(primary_key, credentials)
    sample_clause = f"LIMIT {sample_size}" if sample_size else ""

    query = f"""
    WITH
      expansion AS (
      SELECT
        *
      FROM
        `{table_ref}` AS expansion  # patentcity.techdiffusion.expansion_additivemanufacturing
      WHERE
        expansion_level NOT LIKE "%SEED%" )
    SELECT
      expansion.*,
      p.abstract AS text
    FROM
      expansion
    LEFT JOIN
      `{project_id}.patents.publications` AS p
    ON
      expansion.{primary_key.value} = p.{primary_key.value}
    WHERE
      p.abstract IS NOT NULL
      AND p.abstract != ""
      AND p.abstract != "\\n"
      AND p.abstract != "\\n\\n"
    ORDER BY RAND()  
    {sample_clause}
    """
    get_bq_job_done(query, destination_table, credentials, verbose=verbose)

get_training_data(primary_key, seed_table, expansion_table, af_antiseed_size, destination_table, credentials, verbose=False)

Return training data

Parameters:

Name Type Description Default
primary_key PrimaryKey

table primary key

required
seed_table str

seed table (project.dataset.table)

required
expansion_table str

expansion table used for drawing the antiseed a la AF (project.dataset.table)

required
af_antiseed_size int

size of the antiseed a la AF

required
destination_table str

destination table

required
credentials Path

BQ credentials file path

required
verbose bool

verbosity

False

Usage:

techlandscape io get-training-data family_id <seed-table> <expansion-table> <af-seed-size> <destination-table> credentials_bq.json

Source code in techlandscape/io.py
@app.command()
def get_training_data(
    primary_key: PrimaryKey,
    seed_table: str,
    expansion_table: str,
    af_antiseed_size: int,
    destination_table: str,
    credentials: Path,
    verbose: bool = False,
) -> None:
    """
    Return training data

    Arguments:
        primary_key: table primary key
        seed_table: seed table (project.dataset.table)
        expansion_table: expansion table used for drawing the antiseed a la AF (project.dataset.table)
        af_antiseed_size: size of the antiseed a la AF
        destination_table: destination table
        credentials: BQ credentials file path
        verbose: verbosity

    **Usage:**
        ```shell
        techlandscape io get-training-data family_id <seed-table> <expansion-table> <af-seed-size> <destination-table> credentials_bq.json
        ```
    """
    project_id = get_project_id(primary_key, credentials)
    query = f"""
    WITH
      manual AS (
      SELECT
        {primary_key.value},
        expansion_level,
        CAST(expansion_level="SEED" AS INT64) AS is_seed,
        STRUCT(CAST(expansion_level="SEED" AS INT64) AS SEED,
          CAST(expansion_level LIKE "ANTISEED%" AS INT64) AS NOT_SEED) AS cats,
        text
      FROM
        `{seed_table}`
      WHERE
        expansion_level LIKE "%SEED%"),
      random AS (
    SELECT
      r.{primary_key.value} AS {primary_key.value},
      "ANTISEED-AF" AS expansion_level,
      CAST(0 AS INT64) AS is_seed,
      STRUCT(0 AS SEED,
        1 AS NOT_SEED) AS cats,
      abstract AS text
    FROM
      `{project_id}.patents.publications` AS r #country_prefix
    LEFT OUTER JOIN
      `{expansion_table}` AS tmp
    ON
      r.{primary_key.value} = tmp.{primary_key.value}
    WHERE
      abstract IS NOT NULL
      AND abstract != ""
    ORDER BY
      RAND()
    LIMIT
      {af_antiseed_size}
      )
    SELECT * FROM manual
    UNION ALL
    SELECT * FROM  random 
    """
    get_bq_job_done(query, destination_table, credentials, verbose=verbose)