expansion
get_citation_expansion(primary_key, citation_kind, expansion_level, table_ref, destination_table, credentials, **kwargs)
¶
Expand "along" the citation dimension, either backward or forward
Parameters:
Name | Type | Description | Default |
---|---|---|---|
primary_key |
PrimaryKey |
table primary key |
required |
citation_kind |
CitationKind |
kind of citations |
required |
expansion_level |
CitationExpansionLevel |
expansion level |
required |
table_ref |
str |
expansion table (project.dataset.table) |
required |
destination_table |
str |
query results destination table (project.dataset.table) |
required |
credentials |
Path |
BQ credentials file path |
required |
**kwargs |
|
key worded args passed to bigquery.QueryJobConfig |
{} |
Usage:
Source code in techlandscape/expansion.py
def get_citation_expansion(
primary_key: PrimaryKey,
citation_kind: CitationKind,
expansion_level: CitationExpansionLevel,
table_ref: str,
destination_table: str,
credentials: Path,
**kwargs,
):
"""
Expand "along" the citation dimension, either backward or forward
Arguments:
primary_key: table primary key
citation_kind: kind of citations
expansion_level: expansion level
table_ref: expansion table (project.dataset.table)
destination_table: query results destination table (project.dataset.table)
credentials: BQ credentials file path
**kwargs: key worded args passed to bigquery.QueryJobConfig
**Usage:**
"""
dataset = (
"google_patents_research"
if citation_kind.value == CitationKind.forward.value
and primary_key.value == PrimaryKey.publication_number.value
else "patents"
)
expansion_level_clause = (
'AND expansion_level in ("SEED", "PC")'
if expansion_level.value == CitationExpansionLevel.L1.value
else 'AND expansion_level LIKE "L1%"'
)
project_id_ = get_project_id(primary_key, credentials)
# query_suffix = (
# ""
# if primary_key.value == PrimaryKey.publication_number.value
# else f""", UNNEST({primary_key.value}) as {primary_key.value}"""
# )
cit_var = (
"citation" if citation_kind.value == CitationKind.backward.value else "cited_by"
)
query = f"""
WITH expansion AS(
SELECT
DISTINCT cit.{primary_key.value}
FROM
`{project_id_}.{dataset}.publications` AS p,
{table_ref} AS tmp,
UNNEST({cit_var}) AS cit
WHERE
p.{primary_key.value}=tmp.{primary_key.value}
AND cit.{primary_key.value} IS NOT NULL
{expansion_level_clause}
)
SELECT
DISTINCT({primary_key.value}),
"{'-'.join([expansion_level.value, citation_kind.value.upper()])}" AS expansion_level
FROM
expansion
"""
# {query_suffix}
get_bq_job_done(query, destination_table, credentials, **kwargs)
get_expansion(primary_key, tech_class, table_ref, destination_table, staging_dataset, credentials, random_share=None, precomputed=False, n_pc=50)
¶
Expand along PC and citations (L1 + L2)
Parameters:
Name | Type | Description | Default |
---|---|---|---|
primary_key |
PrimaryKey |
table primary key |
required |
tech_class |
TechClass |
technological class considered |
required |
table_ref |
str |
seed table (project.dataset.table) |
required |
destination_table |
str |
query results destination table (project.dataset.table) |
required |
staging_dataset |
str |
intermediary table staging dataset (project.dataset) |
required |
credentials |
Path |
BQ credentials file path |
required |
random_share |
float |
share of the seed randomly drawn and used for the expansion |
None |
precomputed |
bool |
True if pc odds pre-computed |
False |
n_pc |
int |
nb most important pc for pc expansion |
50 |
Usage:
TECH="additivemanufacturing"
techlandscape expansion get-expansion family_id cpc patentcity.techdiffusion.seed_${TECH} patentcity.techdiffusion.expansion_${TECH} patentcity.stage credentials_bq.json
Source code in techlandscape/expansion.py
@app.command()
def get_expansion(
primary_key: PrimaryKey,
tech_class: TechClass,
table_ref: str,
destination_table: str,
staging_dataset: str,
credentials: Path,
random_share: float = None,
precomputed: bool = False,
n_pc: int = 50,
):
"""
Expand along PC and citations (L1 + L2)
Arguments:
primary_key: table primary key
tech_class: technological class considered
table_ref: seed table (project.dataset.table)
destination_table: query results destination table (project.dataset.table)
staging_dataset: intermediary table staging dataset (project.dataset)
credentials: BQ credentials file path
random_share: share of the seed randomly drawn and used for the expansion
precomputed: True if pc odds pre-computed
n_pc: nb most important pc for pc expansion
**Usage:**
```shell
TECH="additivemanufacturing"
techlandscape expansion get-expansion family_id cpc patentcity.techdiffusion.seed_${TECH} patentcity.techdiffusion.expansion_${TECH} patentcity.stage credentials_bq.json
```
"""
get_seed(
primary_key,
table_ref,
destination_table,
credentials,
random_share,
write_disposition="WRITE_TRUNCATE",
verbose=False,
)
get_full_pc_expansion(
primary_key,
tech_class,
n_pc,
destination_table,
destination_table,
staging_dataset,
credentials,
precomputed,
write_disposition="WRITE_APPEND",
verbose=False,
)
get_full_citation_expansion(
primary_key,
CitationExpansionLevel.L1,
destination_table,
destination_table,
credentials,
write_disposition="WRITE_APPEND",
verbose=False,
)
get_full_citation_expansion(
primary_key,
CitationExpansionLevel.L2,
destination_table,
destination_table,
credentials,
write_disposition="WRITE_APPEND",
verbose=False,
)
get_reduced_expansion(
primary_key,
destination_table,
destination_table,
credentials,
write_disposition="WRITE_TRUNCATE",
verbose=False,
)
get_full_citation_expansion(primary_key, expansion_level, table_ref, destination_table, credentials, **kwargs)
¶
Expand along the citation level, both backward and forward
Parameters:
Name | Type | Description | Default |
---|---|---|---|
primary_key |
PrimaryKey |
table primary key |
required |
expansion_level |
CitationExpansionLevel |
expansion level |
required |
table_ref |
str |
expansion table (project.dataset.table) |
required |
destination_table |
str |
query results destination table (project.dataset.table) |
required |
credentials |
Path |
BQ credentials file path |
required |
**kwargs |
|
key worded args passed to bigquery.QueryJobConfig |
{} |
Usage:
Source code in techlandscape/expansion.py
def get_full_citation_expansion(
primary_key: PrimaryKey,
expansion_level: CitationExpansionLevel,
table_ref: str,
destination_table: str,
credentials: Path,
**kwargs,
):
"""Expand along the citation level, both backward and forward
Arguments:
primary_key: table primary key
expansion_level: expansion level
table_ref: expansion table (project.dataset.table)
destination_table: query results destination table (project.dataset.table)
credentials: BQ credentials file path
**kwargs: key worded args passed to bigquery.QueryJobConfig
**Usage:**
"""
get_citation_expansion(
primary_key,
CitationKind.backward,
expansion_level,
table_ref,
destination_table,
credentials,
**kwargs,
)
get_citation_expansion(
primary_key,
CitationKind.forward,
expansion_level,
table_ref,
destination_table,
credentials,
**kwargs,
)
get_full_pc_expansion(primary_key, tech_class, n_pc, table_ref, destination_table, staging_dataset, credentials, precomputed=False, **kwargs)
¶
Compute (or use precomputed) seed pc odds and expand along the pc dimension.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
primary_key |
PrimaryKey |
table primary key |
required |
tech_class |
TechClass |
technological class considered |
required |
n_pc |
int |
nb most important pc for pc expansion |
required |
table_ref |
str |
expansion table (project.dataset.table) |
required |
destination_table |
str |
query results destination table (project.dataset.table) |
required |
staging_dataset |
str |
intermediary table staging dataset (project.dataset) |
required |
credentials |
Path |
BQ credentials file path |
required |
precomputed |
bool |
True if pc odds pre-computed |
False |
**kwargs |
|
key worded args passed to bigquery.QueryJobConfig |
{} |
Usage:
Source code in techlandscape/expansion.py
def get_full_pc_expansion(
primary_key: PrimaryKey,
tech_class: TechClass,
n_pc: int,
table_ref: str,
destination_table: str,
staging_dataset: str,
credentials: Path,
precomputed: bool = False,
**kwargs,
):
"""Compute (or use precomputed) seed pc odds and expand along the pc dimension.
Arguments:
primary_key: table primary key
tech_class: technological class considered
n_pc: nb most important pc for pc expansion
table_ref: expansion table (project.dataset.table)
destination_table: query results destination table (project.dataset.table)
staging_dataset: intermediary table staging dataset (project.dataset)
credentials: BQ credentials file path
precomputed: True if pc odds pre-computed
**kwargs: key worded args passed to bigquery.QueryJobConfig
**Usage:**
"""
name = table_ref.split(".")[-1].replace("seed_", "")
if not precomputed:
get_seed_pc_freq(
primary_key,
tech_class,
table_ref,
f"{staging_dataset}.{name}_seed_{tech_class.value}_freq",
credentials,
verbose=False,
)
get_universe_pc_freq(
primary_key,
tech_class,
table_ref,
f"{staging_dataset}.{name}_universe_{tech_class.value}_freq",
credentials,
verbose=False,
)
get_seed_pc_odds(
tech_class,
f"{staging_dataset}.{name}_seed_{tech_class.value}_freq",
f"{staging_dataset}.{name}_universe_{tech_class.value}_freq",
f"{staging_dataset}.{name}_seed_{tech_class.value}_odds",
credentials,
verbose=False,
)
get_pc_expansion(
primary_key,
tech_class,
n_pc,
f"{staging_dataset}.{name}_seed_{tech_class.value}_odds",
destination_table,
credentials,
**kwargs,
)
get_pc_expansion(primary_key, tech_class, n_pc, table_ref, destination_table, credentials, **kwargs)
¶
Expand the seed "along" the pc dimension for patent classes with large odds in
Parameters:
Name | Type | Description | Default |
---|---|---|---|
primary_key |
PrimaryKey |
table primary key |
required |
tech_class |
TechClass |
technological class considered |
required |
n_pc |
int |
nb most important pc for pc expansion |
required |
table_ref |
str |
pc odds table (project.dataset.table) |
required |
destination_table |
str |
query results destination table (project.dataset.table) |
required |
credentials |
Path |
BQ credentials file path |
required |
**kwargs |
|
key worded args passed to bigquery.QueryJobConfig |
{} |
Usage:
Source code in techlandscape/expansion.py
def get_pc_expansion(
primary_key: PrimaryKey,
tech_class: TechClass,
n_pc: int,
table_ref: str,
destination_table: str,
credentials: Path,
**kwargs,
):
"""
Expand the seed "along" the pc dimension for patent classes with large odds in
Arguments:
primary_key: table primary key
tech_class: technological class considered
n_pc: nb most important pc for pc expansion
table_ref: pc odds table (project.dataset.table)
destination_table: query results destination table (project.dataset.table)
credentials: BQ credentials file path
**kwargs: key worded args passed to bigquery.QueryJobConfig
**Usage:**
"""
project_id = get_project_id(primary_key, credentials)
query = f"""
WITH
important_pc AS (
SELECT
{tech_class.value}
FROM
`{table_ref}`
ORDER BY
odds DESC
LIMIT
{n_pc} ),
patents AS (
SELECT
{primary_key.value},
{tech_class.value}.code AS {tech_class.value}
FROM
`{project_id}.patents.publications`,
UNNEST({tech_class.value}) AS {tech_class.value} )
SELECT
patents.{primary_key.value},
"PC" as expansion_level
# STRING_AGG(DISTINCT(patents.{tech_class.value})) AS {tech_class.value} ## dbg
FROM
important_pc
LEFT JOIN
patents
ON
important_pc.{tech_class.value}=patents.{tech_class.value}
GROUP BY
{primary_key.value}
"""
get_bq_job_done(query, destination_table, credentials, **kwargs)
get_reduced_expansion(primary_key, table_ref, destination_table, credentials, **kwargs)
¶
Return the table_ref
in a reduced format. Each primary_key
appears only once
primary_key: table primary key
table_ref: expansion table (project.dataset.table)
destination_table: query results destination table (project.dataset.table)
credentials: BQ credentials file path
**kwargs: key worded args passed to bigquery.QueryJobConfig
Usage:
Source code in techlandscape/expansion.py
def get_reduced_expansion(
primary_key: PrimaryKey,
table_ref: str,
destination_table: str,
credentials: Path,
**kwargs,
):
"""
Return the `table_ref` in a reduced format. Each `primary_key` appears only once
primary_key: table primary key
table_ref: expansion table (project.dataset.table)
destination_table: query results destination table (project.dataset.table)
credentials: BQ credentials file path
**kwargs: key worded args passed to bigquery.QueryJobConfig
**Usage:**
"""
query = f"""
SELECT
{primary_key.value},
CASE
WHEN STRING_AGG(expansion_level) LIKE "%ANTISEED%" THEN "ANTISEED"
WHEN STRING_AGG(expansion_level) LIKE "%SEED%" THEN "SEED"
WHEN STRING_AGG(expansion_level) LIKE "%PC%" THEN "PC"
WHEN STRING_AGG(expansion_level) LIKE "%L1-BACK%" THEN "L1-BACK"
WHEN STRING_AGG(expansion_level) LIKE "%L1-FOR%" THEN "L1-FOR"
WHEN STRING_AGG(expansion_level) LIKE "%L2-BACK%" THEN "L2-BACK"
WHEN STRING_AGG(expansion_level) LIKE "%L2-FOR%" THEN "L2-FOR"
ELSE NULL
END AS expansion_level,
STRING_AGG(expansion_level) as expansion_level_,
COUNT(expansion_level) as nb_match,
FROM `{table_ref}`
GROUP BY {primary_key.value}
"""
get_bq_job_done(query, destination_table, credentials, **kwargs)
get_seed(primary_key, table_ref, destination_table, credentials, random_share=None, verbose=False, **kwargs)
¶
Return the starting block of the expansion. Random draw enabled (e.g. for robustness analysis)
Parameters:
Name | Type | Description | Default |
---|---|---|---|
primary_key |
PrimaryKey |
table primary key |
required |
table_ref |
str |
manually annotated seed table (project.dataset.table) |
required |
destination_table |
str |
query results destination table (project.dataset.table) |
required |
credentials |
Path |
BQ credentials file path |
required |
random_share |
float |
size of the random draw (if not None) |
None |
verbose |
bool |
verbosity |
False |
**kwargs |
|
key worded args passed to bigquery.QueryJobConfig |
{} |
Usage:
Source code in techlandscape/expansion.py
def get_seed(
primary_key: PrimaryKey,
table_ref: str,
destination_table: str,
credentials: Path,
random_share: float = None,
verbose: bool = False,
**kwargs,
):
"""
Return the starting block of the expansion. Random draw enabled (e.g. for robustness analysis)
Arguments:
primary_key: table primary key
table_ref: manually annotated seed table (project.dataset.table)
destination_table: query results destination table (project.dataset.table)
credentials: BQ credentials file path
random_share: size of the random draw (if not None)
verbose: verbosity
**kwargs: key worded args passed to bigquery.QueryJobConfig
**Usage:**
"""
random_share_clause = f"""AND RAND()<{random_share}""" if random_share else ""
query = f"""
SELECT
{primary_key.value},
expansion_level
FROM
`{table_ref}` # patentcity.techdiffusion.seed_additivemanufacturing
WHERE
expansion_level LIKE "%SEED%"
{random_share_clause}"""
get_bq_job_done(query, destination_table, credentials, verbose=verbose, **kwargs)
get_seed_pc_freq(primary_key, tech_class, table_ref, destination_table, credentials, **kwargs)
¶
Return the frequency of patent classes in the seed
Parameters:
Name | Type | Description | Default |
---|---|---|---|
primary_key |
PrimaryKey |
table primary key |
required |
tech_class |
TechClass |
technological class considered |
required |
table_ref |
str |
expansion table (project.dataset.table) |
required |
destination_table |
str |
query results destination table (project.dataset.table) |
required |
credentials |
Path |
BQ credentials file path |
required |
**kwargs |
|
key worded args passed to bigquery.QueryJobConfig |
{} |
Usage:
Source code in techlandscape/expansion.py
def get_seed_pc_freq(
primary_key: PrimaryKey,
tech_class: TechClass,
table_ref: str,
destination_table: str,
credentials: Path,
**kwargs,
):
"""
Return the frequency of patent classes in the seed
Arguments:
primary_key: table primary key
tech_class: technological class considered
table_ref: expansion table (project.dataset.table)
destination_table: query results destination table (project.dataset.table)
credentials: BQ credentials file path
**kwargs: key worded args passed to bigquery.QueryJobConfig
**Usage:**
"""
project_id = get_project_id(primary_key, credentials)
query = f"""
WITH tmp AS (
SELECT
tmp.{primary_key.value},
{tech_class.value}.code AS {tech_class.value}
FROM
`{project_id}.patents.publications` AS p,
`{table_ref}` AS tmp,
UNNEST({tech_class.value}) AS {tech_class.value}
WHERE
p.{primary_key.value}=tmp.{primary_key.value}
and tmp.expansion_level = "SEED"
)
,
total AS (
SELECT
COUNT({primary_key.value}) as count
FROM
`{table_ref}`
WHERE expansion_level = "SEED"
)
SELECT
{tech_class.value},
count({tech_class.value}) as n_{tech_class.value},
total.count as n_patents,
count({tech_class.value})/total.count as freq
FROM
tmp,
total
GROUP BY
{tech_class.value},
total.count
ORDER BY
freq DESC
"""
get_bq_job_done(query, destination_table, credentials, **kwargs)
get_seed_pc_odds(tech_class, table_seed, table_universe, destination_table, credentials, **kwargs)
¶
Return the odds of patent classes in seed compared to the universe of patents
Parameters:
Name | Type | Description | Default |
---|---|---|---|
tech_class |
TechClass |
technological class considered |
required |
table_seed |
str |
pc freq seed table (project.dataset.table) |
required |
table_universe |
str |
pc freq universe table (project.dataset.table) |
required |
destination_table |
str |
query results destination table (project.dataset.table) |
required |
credentials |
Path |
BQ credentials file path |
required |
**kwargs |
|
key worded args passed to bigquery.QueryJobConfig |
{} |
Source code in techlandscape/expansion.py
def get_seed_pc_odds(
tech_class: TechClass,
table_seed: str,
table_universe: str,
destination_table: str,
credentials: Path,
**kwargs,
):
"""
Return the odds of patent classes in seed compared to the universe of patents
Arguments:
tech_class: technological class considered
table_seed: pc freq seed table (project.dataset.table)
table_universe: pc freq universe table (project.dataset.table)
destination_table: query results destination table (project.dataset.table)
credentials: BQ credentials file path
**kwargs: key worded args passed to bigquery.QueryJobConfig
"""
query = f"""
WITH tmp AS (
SELECT
seed.{tech_class.value} as {tech_class.value},
seed.freq as seed_freq,
universe.freq as universe_freq
FROM
`{table_seed}` as seed,
`{table_universe}` as universe
WHERE seed.{tech_class.value} = universe.{tech_class.value} and seed.{tech_class.value} IS NOT NULL)
SELECT
{tech_class.value},
seed_freq,
universe_freq,
seed_freq / universe_freq as odds
FROM tmp
ORDER BY
odds DESC
"""
get_bq_job_done(query, destination_table, credentials, **kwargs)
get_universe_pc_freq(primary_key, tech_class, table_ref, destination_table, credentials, **kwargs)
¶
Return the frequency of patent classes in the universe of patents. Nb: we restrict to patents published between p25 and p75 of the publication_year of patents in the seed.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
primary_key |
PrimaryKey |
table primary key |
required |
tech_class |
TechClass |
technological class considered |
required |
table_ref |
str |
expansion table (project.dataset.table) |
required |
destination_table |
str |
query results destination table (project.dataset.table) |
required |
credentials |
Path |
BQ credentials file path |
required |
**kwargs |
|
key worded args passed to bigquery.QueryJobConfig |
{} |
Usage:
Source code in techlandscape/expansion.py
def get_universe_pc_freq(
primary_key: PrimaryKey,
tech_class: TechClass,
table_ref: str,
destination_table: str,
credentials: Path,
**kwargs,
):
"""
Return the frequency of patent classes in the universe of patents. Nb: we restrict to patents published between p25
and p75 of the publication_year of patents in the seed.
Arguments:
primary_key: table primary key
tech_class: technological class considered
table_ref: expansion table (project.dataset.table)
destination_table: query results destination table (project.dataset.table)
credentials: BQ credentials file path
**kwargs: key worded args passed to bigquery.QueryJobConfig
**Usage:**
"""
# TODO unrestricted time span option?
# Support family_id level?
project_id = get_project_id(primary_key, credentials)
query = f"""
WITH
tmp AS (
SELECT
tmp.{primary_key.value},
CAST(p.publication_date/10000 AS INT64) AS publication_year
FROM
{table_ref} AS tmp,
`{project_id}.patents.publications` AS p
WHERE
p.{primary_key.value}=tmp.{primary_key.value}
AND tmp.expansion_level="SEED"),
stats AS (
SELECT
percentiles[OFFSET(25)] AS p25,
percentiles[OFFSET(75)] AS p75
FROM (
SELECT
APPROX_QUANTILES(publication_year, 100) AS percentiles
FROM
tmp)),
total as (SELECT
count({primary_key.value}) as count
FROM
`{project_id}.patents.publications` AS p,
stats
WHERE
publication_date BETWEEN stats.p25*10000
AND stats.p75*10000)
SELECT
{tech_class.value}.code AS {tech_class.value},
COUNT({tech_class.value}.code) as n_cpc,
total.count as n_patents,
COUNT({tech_class.value}.code)/total.count as freq
FROM
`{project_id}.patents.publications` AS p,
UNNEST({tech_class.value}) AS {tech_class.value},
stats,
total
WHERE
publication_date BETWEEN stats.p25*10000
AND stats.p75*10000
GROUP BY
{tech_class.value},
total.count
ORDER BY
freq DESC
"""
get_bq_job_done(query, destination_table, credentials, **kwargs)