candidates
QueryCandidates
¶
Class
get_query(self)
¶
Return candidates query (all)
Source code in techlandscape/candidates.py
def get_query(self) -> str:
    """Return candidates query (all)

    Builds one sub-query per seed source (`self.patents`, `self.cpcs`,
    `self.keywords`), concatenates them with UNION ALL and aggregates the
    result at the `family_id` level.

    Seed sources that are empty (or None) are skipped: building a sub-query
    from an empty list would yield invalid SQL (`IN ()` or a bare `WHERE`).

    Raises:
        ValueError: if none of the three seed sources holds any value.
    """
    seed_sources = (
        (self.patents, self.get_query_patents),
        (self.cpcs, self.get_query_cpcs),
        (self.keywords, self.get_query_keywords),
    )
    sub_queries = [build(seeds) for seeds, build in seed_sources if seeds]
    if not sub_queries:
        raise ValueError(
            "At least one of patents, cpcs or keywords is required to build "
            "the candidates query"
        )
    # Each sub-query starts and ends with a newline, so joining on the bare
    # keyword still leaves "UNION ALL" on a line of its own.
    union = "UNION ALL".join(sub_queries)
    # NOTE(review): `match` aggregates DISTINCT values whereas `match_number`
    # counts every matching row - confirm this asymmetry is intended.
    query = f"""
WITH tmp AS (
{union}
)
SELECT
p.family_id,
STRING_AGG(tmp.publication_number) AS publication_number,
CONCAT(ANY_VALUE(tmp.title), "\\n\\n", ANY_VALUE(tmp.abstract)) AS text,
STRING_AGG(DISTINCT(tmp.match), "," order by tmp.match ASC ) AS match,
ARRAY_LENGTH(SPLIT(STRING_AGG(tmp.match))) AS match_number,
FROM
tmp
LEFT JOIN
`patents-public-data.patents.publications` AS p
ON
tmp.publication_number=p.publication_number
GROUP BY
family_id
"""
    return query
get_query_cpcs(self, cpcs)
¶
Return candidates query based on cpcs
Source code in techlandscape/candidates.py
def get_query_cpcs(self, cpcs: List[str]) -> str:
    """Return candidates query based on cpcs

    Selects publication number, title and abstract of the publications
    having at least one CPC code starting with one of `cpcs`. Every row is
    tagged with the match type "cpc". When `cpcs` is empty, `self.cpcs` is
    used instead.
    """
    prefixes = cpcs or self.cpcs
    # One prefix condition per CPC code, OR-ed together.
    conditions = " OR ".join(f'cpc.code LIKE "{prefix}%"' for prefix in prefixes)
    return f"""
SELECT
publication_number,
title,
abstract,
"cpc" AS match
FROM
`patents-public-data.google_patents_research.publications` AS gpr,
UNNEST(cpc) as cpc
WHERE {conditions}
"""
get_query_keywords(self, keywords)
¶
Return candidates query based on keywords
Source code in techlandscape/candidates.py
def get_query_keywords(self, keywords: List[str]) -> str:
    """Return candidates query based on keywords

    Selects publication number, title and abstract of the publications whose
    abstract contains at least one of `keywords`. Every row is tagged with
    the match type "keyword". When `keywords` is empty, `self.keywords` is
    used instead.

    Matching is case-insensitive: the abstract is lower-cased in SQL and the
    keywords are lower-cased here.
    """
    keywords = keywords if keywords else self.keywords
    # The pattern is compared with LOWER(abstract): a keyword holding an
    # upper-case character could never match unless it is lower-cased too.
    conditions = " OR ".join(
        f'LOWER(gpr.abstract) LIKE "%{keyword.lower()}%"' for keyword in keywords
    )
    query = f"""
SELECT
publication_number,
title,
abstract,
"keyword" AS match
FROM
`patents-public-data.google_patents_research.publications` AS gpr
WHERE {conditions}
"""
    return query
get_query_patents(self, patents)
¶
Return candidates query based on patent similarity
Source code in techlandscape/candidates.py
def get_query_patents(self, patents: List[str]) -> str:
    """Return candidates query based on patent similarity

    Unnests the `similar` field of the seed publications listed in `patents`,
    then joins the similar publication numbers back to the publications
    table to fetch their title and abstract. Every row is tagged with the
    match type "patent". When `patents` is empty, `self.patents` is used
    instead.
    """
    seeds = patents or self.patents
    # Comma-separated, double-quoted publication numbers for the IN clause.
    quoted_seeds = ",".join(f'"{number}"' for number in seeds)
    return f"""
WITH SIMILAR AS (
SELECT
similar.publication_number,
"patent" AS match
FROM
`patents-public-data.google_patents_research.publications` AS gpr,
UNNEST(similar) AS similar
WHERE
gpr.publication_number IN ({quoted_seeds})
)
SELECT
SIMILAR.publication_number,
title,
abstract,
SIMILAR.match
FROM
`patents-public-data.google_patents_research.publications` AS gpr,
SIMILAR
WHERE
SIMILAR.publication_number=gpr.publication_number
AND SIMILAR.publication_number IS NOT NULL
"""
get_candidates(config, destination_table, credentials, verbose=False)
¶
Return seed candidates based on `config`. The candidate table is saved to `destination_table`.
Source code in techlandscape/candidates.py
@app.command()
def get_candidates(
    config: Path, destination_table: str, credentials: Path, verbose: bool = False
):
    """
    Return seed candidates based on `config`. Candidate table is saved to `destination_table`
    """
    # NOTE(review): the docstring is presumably rendered as the command's help
    # text, hence it is kept word for word.
    # Build the full candidates query from the config, then hand it over to
    # the job helper together with the table it should be written to.
    get_bq_job_done(
        QueryCandidates(config).get_query(),
        destination_table,
        credentials,
        verbose=verbose,
    )
get_candidates_sample(table_ref, credentials, destination_table=None, size_bin=500, verbose=False)
¶
Return a sample of seed candidates. If no `destination_table` is given, the output is saved to `table_ref` (overwriting it).
Source code in techlandscape/candidates.py
@app.command()
def get_candidates_sample(
    table_ref: str,
    credentials: Path,
    destination_table: str = None,
    size_bin: int = 500,
    verbose: bool = False,
):
    """Return sample of seed candidates. If no `destination_table`, output saved to `table_ref`(overwrite)"""
    # NOTE(review): the docstring is presumably rendered as the command's help
    # text, hence it is kept word for word.
    # Without an explicit destination, the sample replaces the input table.
    target_table = destination_table or table_ref
    # `table_stats` counts the rows per `match` value (nb_bin). A row is kept
    # when RAND() < size_bin / nb_bin, i.e. about `size_bin` rows per `match`
    # value on average. The stats columns are part of the `SELECT *` output.
    sample_query = f"""
WITH
table_stats AS (
SELECT
*,
SUM(nb_bin) OVER() AS nb_total
FROM (
SELECT
match,
COUNT(match) AS nb_bin
FROM
`{table_ref}`
GROUP BY
match) )
SELECT
*
FROM
`{table_ref}`
JOIN
table_stats
USING
(match)
WHERE
RAND()<{size_bin}/nb_bin
ORDER BY
RAND()"""
    get_bq_job_done(sample_query, target_table, credentials, verbose=verbose)
prep_prodigy_annotation(data, config)
¶
Return `data` with options for prodigy annotation to stdout.
Nb: `data` is expected to be a jsonl file.
Source code in techlandscape/candidates.py
@app.command()
def prep_prodigy_annotation(data: Path, config: Path):
    """Return `data` with options for prodigy annotation to stdout.
    Nb: data is expected to be a jsonl file."""
    # NOTE(review): the docstring is presumably rendered as the command's help
    # text, hence it is kept word for word.
    settings = get_config(config)
    options = settings.get("option")
    # The option list is the same for every record: build it once. It stays
    # None when the config has no (or an empty) "option" entry, in which case
    # records are echoed unchanged.
    prodigy_options = (
        [{"id": i, "text": option} for i, option in enumerate(options)]
        if options
        else None
    )
    # Context manager so that the file handle is closed even if a line fails
    # to parse.
    with Path(data).open("r") as lines:
        for raw_line in lines:
            if not raw_line.strip():
                # Blank lines carry no record and would make json.loads fail.
                continue
            record = json.loads(raw_line)
            if prodigy_options is not None:
                record.update({"options": prodigy_options})
            typer.echo(json.dumps(record))