Skip to content

io

IO

add_cpc_codes(src_table, destination_table, credentials)

Parameters:

Name Type Description Default
src_table str

source table (project.dataset.table)

required
destination_table str

destination table (project.dataset.table)

required
credentials str

BQ credentials file path

required

Usage:

patentcity io add-cpc-codes <src_table> <destination_table> credentials-patentcity.json

Source code in patentcity/io.py
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
@app.command()
def add_cpc_codes(src_table: str, destination_table: str, credentials: str) -> None:
    """Add a `cpc_code` column (comma-separated CPC codes per publication) to `src_table`
    and save the result to `destination_table`.

    Arguments:
        src_table: source table (project.dataset.table)
        destination_table: destination table (project.dataset.table)
        credentials: BQ credentials file path

    **Usage:**
        ```shell
        patentcity io add-cpc-codes <src_table> <destination_table> credentials-patentcity.json
        ```
    """
    # CPC codes are aggregated per publication_number from the public patents
    # dataset, then LEFT JOINed so rows without CPC data are kept (NULL cpc_code).
    query = f"""
    WITH tmp AS(
    SELECT
    publication_number,
    STRING_AGG(cpc.code) AS cpc_code
    FROM
    `patents-public-data.patents.publications`,
    UNNEST(cpc)
    AS
    cpc
    GROUP BY
    publication_number)

    SELECT
    pc. *, tmp.cpc_code
    FROM
    `{src_table}` # patentcity.patentcity.v100rc7
    AS pc
    LEFT JOIN tmp
    ON pc.publication_number = tmp.publication_number"""
    _get_job_done(query, destination_table, credentials)

augment_patentcity(src_table, destination_table, credentials=None)

Add (mainly interoperability) variables to `src_table` and save to `destination_table`

Parameters:

Name Type Description Default
src_table str

source table (project.dataset.table)

required
destination_table str

destination table (project.dataset.table)

required
credentials str

BQ credentials file path

None

Usage:

patentcity io augment-patentcity <src_table> <destination_table> credentials-patentcity.json

Source code in patentcity/io.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
@app.command()
def augment_patentcity(
    src_table: str, destination_table: str, credentials: str = None
) -> None:
    """Add (mainly interoperability) variables to `src_table` and save to `destination_table`

    Arguments:
        src_table: source table (project.dataset.table)
        destination_table: destination table (project.dataset.table)
        credentials: BQ credentials file path

    **Usage:**
        ```shell
        patentcity io augment-patentcity <src_table> <destination_table> credentials-patentcity.json
        ```

    """
    # RIGHT JOIN keeps every row of `src_table` even when the publication is
    # absent from patents-public-data (publication_date/family_id are then NULL).
    # country_code/pubnum/kind_code are derived from the "CC-NUM-KC" shape of
    # publication_number.
    query = f"""
    SELECT
      pc.publication_number,
      p.publication_date,
      p.family_id,
      SPLIT(pc.publication_number, "-")[OFFSET(0)] AS country_code,
      SPLIT(pc.publication_number, "-")[OFFSET(1)] AS pubnum,
      SPLIT(pc.publication_number, "-")[OFFSET(2)] AS kind_code,
      pc.* EXCEPT(publication_number)
    FROM
      `patents-public-data.patents.publications` AS p
    RIGHT JOIN
      `{src_table}` AS pc
    ON
      pc.publication_number = p.publication_number
    """
    _get_job_done(query, destination_table, credentials)

build_wgp_as_patentcity(addresses_table, patentee_location_table, patstat_patent_properties_table=None, tls206_table=None, tls207_table=None, destination_table=None, flavor=None, credentials=None)

Join addresses and individuals from WGP and add data at the patent as well as individual level.

Parameters:

Name Type Description Default
addresses_table str

WGP addresses table (project.dataset.table)

required
patentee_location_table str

WGP patentees table (project.dataset.table)

required
patstat_patent_properties_table str

PATSTAT patent properties table on BQ (project.dataset.table)

None
tls206_table str

PATSTAT tls206 table on BQ (project.dataset.table)

None
tls207_table str

PATSTAT tls207 table on BQ (project.dataset.table)

None
destination_table str

destination table (project.dataset.table)

None
flavor int

WGP source data flavor (in [25, 45])

None
credentials str

BQ credentials file path

None

Usage:

patentcity io build-wgp-as-patentcity patentcity.external.addresses_florian45_patentcity patentcity.external.person_location_id --tls206-table patentcity.external.tls206 --tls207-table patentcity.external.tls207 --patstat-patent-properties-table patentcity.external.patstat_patent_properties --destination-table patentcity.tmp.patentcity45 --flavor 45 --key-file $KEY_FILE

Source code in patentcity/io.py
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
@app.command()
def build_wgp_as_patentcity(
    addresses_table: str,
    patentee_location_table: str,
    patstat_patent_properties_table: str = None,
    tls206_table: str = None,
    tls207_table: str = None,
    destination_table: str = None,
    flavor: int = None,
    credentials: str = None,
) -> None:
    """Join addresses and individuals from WGP and add data at the patent as well as individual level.

    Arguments:
        addresses_table: WGP addresses table (project.dataset.table)
        patentee_location_table: WGP patentees table (project.dataset.table)
        patstat_patent_properties_table: PATSTAT patent properties table on BQ (project.dataset.table)
        tls206_table: PATSTAT tls206 table on BQ (project.dataset.table)
        tls207_table: PATSTAT tls207 table on BQ (project.dataset.table)
        destination_table: destination table (project.dataset.table)
        flavor: WGP source data flavor (in [25, 45])
        credentials: BQ credentials file path

    **Usage:**
        ```shell
        patentcity io build-wgp-as-patentcity patentcity.external.addresses_florian45_patentcity patentcity.external.person_location_id --tls206-table patentcity.external.tls206 --tls207-table patentcity.external.tls207 --patstat-patent-properties-table patentcity.external.patstat_patent_properties --destination-table patentcity.tmp.patentcity45 --flavor 45 --key-file $KEY_FILE
        ```
    """
    # Input validation: aborts before `query` is used if flavor is neither 25
    # nor 45 (exactly one of the two branches below then defines `query`).
    assert flavor in [25, 45]
    assert patentee_location_table
    assert addresses_table
    if flavor == 25:
        # Flavor 25: addresses and patentees are linked via recId, then joined
        # to PATSTAT at the *publication* level (pat_publn_id).
        query = f"""

        WITH
          tmp AS (
          SELECT
            patee.* EXCEPT(recId),
            loc.*,
            app_inv="INV" AS is_inv,
            app_inv="APP" AS is_app
          FROM (
            SELECT
              *
            FROM
              `{addresses_table}`  # patentcity.external.addresses_cyril25_patentcity
            WHERE
              seqNumber = 1
              AND (matchLevel="NOMATCH" AND source="HERE") IS FALSE ) AS loc
          JOIN
            `{patentee_location_table}` AS patee
            # patentcity.external.inventor_applicant_recid
          ON
            loc.recId = patee.recId )  # location_id
        SELECT
          tmp.* EXCEPT(appln_id, pat_publn_id),
          patstat.*,
          SPLIT(patstat.publication_number, "-")[OFFSET(0)] AS country_code,
          SPLIT(patstat.publication_number, "-")[OFFSET(1)] AS pubnum,
          SPLIT(patstat.publication_number, "-")[OFFSET(2)] AS kind_code
        FROM
          tmp
        LEFT JOIN
          `{patstat_patent_properties_table}` AS patstat
        ON
          tmp.pat_publn_id = patstat.pat_publn_id #tmp.appln_id = patstat.appln_id
          # here we are at the publication level, not the patent level
        WHERE
          SPLIT(patstat.publication_number, "-")[OFFSET(0)] IN ("DE", "GB", "FR", "US")

        """
    if flavor == 45:
        # Flavor 45 additionally needs the PATSTAT person tables: tls207 links
        # persons to applications, tls206 carries person names.
        assert tls206_table
        assert tls207_table
        # Patentees are joined to persons (person_id), then to addresses
        # (location_id = recId), then to PATSTAT at the *application* level
        # (appln_id).
        query = f"""
            WITH
              tmp AS (
              WITH
                tmp_ AS (
                WITH
                  person AS (
                  SELECT
                    tls207.*,
                    tls206.person_name,
                    invt_seq_nr > 0 AS is_inv,
                    applt_seq_nr > 0 AS is_asg
                  FROM
                    `{tls206_table}` AS tls206,  # usptobias.patstat.tls206
                    `{tls207_table}` AS tls207  # usptobias.patstat.tls207
                  WHERE
                    tls207.person_id=tls206.person_id )
                SELECT
                  patee.*,
                  person.* EXCEPT(person_id)
                FROM
                  `{patentee_location_table}` AS patee  # patentcity.external.person_location_id
                LEFT JOIN
                  person
                ON
                  patee.person_id = person.person_id)
              SELECT
                *
              FROM
                tmp_
              LEFT JOIN
                `{addresses_table}` AS loc  # patentcity.external.addresses_florian45_patentcity
              ON
                tmp_.location_id = loc.recId
              WHERE
                seqNumber = 1   )
            SELECT
              tmp.* EXCEPT(appln_id),
              patstat.*,
              SPLIT(patstat.publication_number, "-")[OFFSET(0)] AS country_code,
              SPLIT(patstat.publication_number, "-")[OFFSET(1)] AS pubnum,
              SPLIT(patstat.publication_number, "-")[OFFSET(2)] AS kind_code
            FROM
              tmp
            LEFT JOIN
              `{patstat_patent_properties_table}` AS patstat
            ON
              tmp.appln_id = patstat.appln_id
              # here we are at the the patent level
            WHERE
              SPLIT(patstat.publication_number, "-")[OFFSET(0)] IN ("DE", "GB", "FR", "US")
            """
    _get_job_done(query, destination_table, credentials)

deduplicate(src_table, destination_table, credentials)

Deduplicate patentcity table from publications which are both in at least 2 of the following data sources PC, WGP45 and WGP25. We prioritize PC, then WGP45 and then WGP25.

Parameters:

src_table: source table (project.dataset.table)
destination_table: destination table (project.dataset.table)
credentials: BQ credentials file path

Usage:

patentcity io deduplicate <src-table> <destination-table> credentials-patentcity.json

Source code in patentcity/io.py
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
@app.command()
def deduplicate(src_table: str, destination_table: str, credentials: str) -> None:
    """
    Deduplicate patentcity table from publications which are both in at least 2 of the following data sources PC, WGP45
    and WGP25. We prioritize PC, then WGP45 and then WGP25.

    Arguments:
        src_table: source table (project.dataset.table)
        destination_table: destination table (project.dataset.table)
        credentials: BQ credentials file path

    **Usage:**
        ```shell
        patentcity io deduplicate <src-table> <destination-table> credentials-patentcity.json
        ```
    """
    # `duplicates` counts occurrences of each publication_number across
    # origins; `keep_list` flags, per (publication_number, origin) pair, the
    # single row to retain according to the PC > WGP45 > WGP25 priority.
    query = f"""
    WITH
      duplicates AS (
      SELECT
        publication_number,
        COUNT(publication_number) AS nb_occ,
        STRING_AGG(DISTINCT(origin)) AS origins
      FROM
        {src_table} #`patentcity.tmp.v100rc5`
      GROUP BY
        publication_number),
      keep_list AS (
      SELECT
        tmp.publication_number,
        tmp.origin,
        duplicates.* EXCEPT(publication_number),
        CASE
          WHEN nb_occ = 1 THEN TRUE
          WHEN nb_occ > 1 AND origins LIKE "%PC%" AND origin="PC" THEN TRUE
          WHEN nb_occ > 1 AND origins LIKE "%WGP45%" AND origins NOT LIKE "%PC%" AND origin="WGP45" THEN TRUE
          WHEN nb_occ > 1 AND origins LIKE "%WGP25%" AND origins NOT LIKE "%PC%" AND origins NOT LIKE "%WGP45%" AND origin="WGP25" THEN TRUE
        ELSE FALSE
      END
        AS keep
      FROM
        {src_table} AS tmp  # `patentcity.tmp.v100rc5`
      LEFT JOIN
        duplicates
      ON
        tmp.publication_number = duplicates.publication_number )
    SELECT
      tmp.*#,
      #keep_list.* EXCEPT(publication_number, origin) ## for dbg
    FROM
      {src_table} AS tmp  # `patentcity.tmp.v100rc5`
    LEFT JOIN
      keep_list
    ON
      tmp.publication_number=keep_list.publication_number
      AND tmp.origin=keep_list.origin
    WHERE keep IS TRUE  """
    _get_job_done(query, destination_table, credentials)

extract_sample_kepler(src_table, dest_file, sample_ratio=0.1, office=None, credentials=None)

Extract sample for kepler.gl

Parameters:

Name Type Description Default
src_table str

source table (project.dataset.table)

required
dest_file str

destination file path (local)

required
sample_ratio float

share of patents to extract

0.1
office str

patent office two letter-code (e.g. DD, DE, FR, etc)

None
credentials str

BQ credentials file path

None

Usage:

patentcity io extract-sample-kepler <src_table> <dest_file> --office DE --credentials credentials-patentcity.json

Source code in patentcity/io.py
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
@app.command()
def extract_sample_kepler(
    src_table: str,
    dest_file: str,
    sample_ratio: float = 0.1,
    office: str = None,
    credentials: str = None,
) -> None:
    """Extract sample for kepler.gl

    Arguments:
        src_table: source table (project.dataset.table)
        dest_file: destination file path (local)
        sample_ratio: share of patents to extract
        office: patent office two letter-code (e.g. DD, DE, FR, etc)
        credentials: BQ credentials file path

    **Usage:**
        ```shell
        patentcity io extract-sample-kepler <src_table> <dest_file> --office DE --credentials credentials-patentcity.json
        ```
    """
    office_clause = f"""AND country_code="{office}" """ if office else ""
    # publication_date is an integer in YYYYMMDD form. Multiplying by 1000000
    # appends six zeros (HHMMSS = 000000), giving the 14-digit string that
    # %Y%m%d%H%M%S expects. (The former *100000 produced only 13 digits, which
    # PARSE_TIMESTAMP cannot parse.)
    query = f"""
    SELECT
      publication_number,
      country_code,
      CAST(publication_date/10000 AS INT64) AS publication_year,
      PARSE_TIMESTAMP('%Y%m%d%H%M%S', CAST(publication_date*1000000 AS STRING)) as publication_date,
      patentee.loc_country as country,
      patentee.loc_city as city,
      patentee.loc_latitude as point_latitude,
      patentee.loc_longitude as point_longitude
    FROM
      {src_table},
      UNNEST(patentee) AS patentee
    WHERE
      RAND()<{sample_ratio}
      AND publication_date>0
      AND patentee.loc_source IS NOT NULL
      AND patentee.loc_latitude IS NOT NULL
      {office_clause}
    """
    client = _get_bq_client(credentials)
    typer.secho(f"Start:\n{query}", fg=typer.colors.BLUE)
    df = client.query(query).to_dataframe()
    df.to_csv(dest_file, index=False)
    typer.secho(f"{ok}Extract for Kepler saved to {dest_file}.", fg=typer.colors.GREEN)

family_expansion(src_table, destination_table, credentials, destination_schema)

Expand along families in table ref. The returned table contains all publications belonging to a family existing in src_table but absent from the latter. Family data are assigned from data in src_table.

Parameters:

Name Type Description Default
src_table str

source table (project.dataset.table)

required
destination_table str

destination table (project.dataset.table)

required
credentials str

BQ credentials file path

required
destination_schema str

destination schema file path

required

Usage:

patentcity io family-expansion <src-table> <destination-table>  credentials-patentcity.json schema/patentcity_v1.json

Source code in patentcity/io.py
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
@app.command()
def family_expansion(
    src_table: str, destination_table: str, credentials: str, destination_schema: str
) -> None:
    """Expand along families in `table ref`. The returned table contains all publications belonging to a family
    existing in `src_table` *but* absent from the latter. Family data are *assigned* from data in `src_table`.

    Arguments:
        src_table: source table (project.dataset.table)
        destination_table: destination table (project.dataset.table)
        credentials: BQ credentials file path
        destination_schema: destination schema file path

    **Usage:**
        ```shell
        patentcity io family-expansion <src-table> <destination-table>  credentials-patentcity.json schema/patentcity_v1.json
        ```
    """
    # 1) family_table: one patentee record per family_id (ANY_VALUE picks an
    #    arbitrary representative within the family).
    # 2) publication_list: publications already present in src_table.
    # 3) expanded_family_table: all publications (DD/DE/FR/GB/US) sharing a
    #    family_id with src_table.
    # Final RIGHT JOIN + `IS NULL` filter is an anti-join: keep only
    # publications NOT already in src_table, tagged origin="EXP".
    query = f"""
    WITH
      family_table AS (
      SELECT
        family_id,
        ANY_VALUE(patentee) as patentee
      FROM
        `{src_table}`  # patentcity.patentcity.v100rc4
     GROUP BY
      family_id   ),
      publication_list AS (
      SELECT
        DISTINCT(publication_number) AS publication_number
      FROM
        `{src_table}`),  # patentcity.patentcity.v100rc4
      expanded_family_table AS (
      SELECT
        p.publication_number,
        p.publication_date,
        family_table.*
      FROM
        `patents-public-data.patents.publications`AS p,
        family_table
      WHERE
        p.family_id = family_table.family_id
        AND family_table.family_id IS NOT NULL
        AND SPLIT(p.publication_number, "-")[OFFSET(0)] in ("DD","DE", "FR", "GB", "US"))#,

    SELECT
    expanded_family_table.*, #EXCEPT(appln_id, pat_publn_id, docdb_family_id, inpadoc_family_id),
    SPLIT(expanded_family_table.publication_number, "-")[OFFSET(0)] as country_code,
    SPLIT(expanded_family_table.publication_number, "-")[OFFSET(1)] as pubnum,
    SPLIT(expanded_family_table.publication_number, "-")[OFFSET(2)] as kind_code,
    "EXP" AS origin
    FROM
    publication_list
    RIGHT JOIN
    expanded_family_table
    ON
    expanded_family_table.publication_number=publication_list.publication_number
    WHERE publication_list.publication_number IS NULL
  """
    _get_job_done(
        query, destination_table, credentials, destination_schema=destination_schema
    )

filter_granted_firstpub(src_table, destination_table, credentials)

Filter src_table so as to keep only first publication of granted patents

Parameters:

Name Type Description Default
src_table str

source table (project.dataset.table)

required
destination_table str

destination table (project.dataset.table)

required
credentials str

BQ credentials file path

required

Usage:

patentcity io filter-granted <src_table> <destination_table> credentials-patentcity.json

Source code in patentcity/io.py
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
@app.command()
def filter_granted_firstpub(
    src_table: str, destination_table: str, credentials: str
) -> None:
    """
    Filter `src_table` so as to keep only first publication of granted patents

    Arguments:
        src_table: source table (project.dataset.table)
        destination_table: destination table (project.dataset.table)
        credentials: BQ credentials file path

    **Usage:**
        ```shell
        patentcity io filter-granted <src_table> <destination_table> credentials-patentcity.json
        ```
    """
    # `tmp` aggregates, per (pubnum, country_code), the list of kind codes
    # (kind_codes), the number of publications (N) and flags for the presence
    # of an A1 (has_A) / B (has_B) publication; the per-office clauses below
    # then select the publication deemed to be the granted first publication.
    # NOTE(review): `has_a` in the DE clause relies on BigQuery column names
    # being case-insensitive (it refers to `has_A`) — confirm intended.
    query = f"""
    WITH
      tmp AS (
      SELECT
        pubnum,
        country_code,
        STRING_AGG(kind_code) AS kind_codes,
        COUNT(pubnum) AS N,
        STRING_AGG(kind_code) LIKE "%A1,%" OR STRING_AGG(kind_code) LIKE "%,A1" OR STRING_AGG(kind_code)="A1" AS has_A,
        STRING_AGG(kind_code) LIKE "%B,%" OR STRING_AGG(kind_code) LIKE "%,B" OR STRING_AGG(kind_code)="B" AS has_B
      FROM
        `{src_table}`
      GROUP BY
        pubnum,
        country_code )
    SELECT
      pc.*,
      tmp.* EXCEPT(pubnum, country_code)
    FROM
      `{src_table}` AS pc,
      tmp
    WHERE
      pc.country_code=tmp.country_code
      AND pc.pubnum = tmp.pubnum
      AND (
        pc.country_code="DD" ## clause DD

        OR (pc.country_code= "DE" ## clause DE
          AND ( (kind_code="C" OR kind_code = "B")
            OR (kind_code="C2" AND N>1 AND has_A=FALSE)
            OR (kind_code = "A1" AND N > 1)
            OR (N = 1 AND has_a = FALSE) ) )

        OR (pc.country_code= "FR" ## clause FR
          AND ( (kind_code = "A")
            OR (kind_code = "A1" AND N > 1)
            OR (kind_code = "A1" AND N = 1 AND publication_date < 19710000)
            OR (kind_code = "A5") ) )

        OR (pc.country_code="GB" ## clause GB
          AND ( (kind_code = "A" AND N > 1)
            OR (kind_code = "A" AND publication_date < 19500000)
            OR (kind_code = "A" AND N = 1 AND CAST(pc.pubnum AS INT64) < 2000000) ) )

        OR pc.country_code = "US" ## clause US
        )
    """
    _get_job_done(query, destination_table, credentials)

filter_kind_codes(src_table, destination_table, credentials)

Filter src_table to make sure that only utility patents are reported.

Parameters:

Name Type Description Default
src_table str

source table (project.dataset.table)

required
destination_table str

destination table (project.dataset.table)

required
credentials str

BQ credentials file path

required

Usage:

patentcity io filter-kind-codes <src_table> <destination_table> credentials-patentcity.json

Source code in patentcity/io.py
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
@app.command()
def filter_kind_codes(src_table: str, destination_table: str, credentials: str) -> None:
    """Restrict `src_table` to *utility patents* only, based on per-office kind codes.

    Builds a keep-list flagging, for each publication, whether its
    (country_code, kind_code) pair belongs to the office-specific whitelist,
    then keeps only the flagged rows.

    Arguments:
        src_table: source table (project.dataset.table)
        destination_table: destination table (project.dataset.table)
        credentials: BQ credentials file path

    **Usage:**
        ```shell
        patentcity io filter-kind-codes <src_table> <destination_table> credentials-patentcity.json
        ```
    """
    filter_query = f"""
    WITH keep_list AS (
    SELECT
      publication_number,
      CASE
        WHEN country_code = "DD" AND (kind_code in ("A", "A1", "A3", "B")) THEN TRUE
        WHEN country_code = "DE" AND (kind_code in ("A1", "B", "B1", "B2", "B3", "B4", "C","C1", "C2", "C3")) THEN TRUE
        WHEN country_code = "FR" AND (kind_code in ("A", "A1","A5", "B1", "B3", "C3")) THEN TRUE
        WHEN country_code = "GB" AND (kind_code in ("A", "B")) THEN TRUE
        WHEN country_code = "US" AND (kind_code in ("A", "B1", "B2")) THEN TRUE
        ELSE FALSE
      END AS keep
    FROM
      `{src_table}`) # patentcity.patentcity.v100rc4
    SELECT
      origin.* FROM
      `{src_table}` as origin,  # patentcity.patentcity.v100rc4
      keep_list
      WHERE
        keep_list.publication_number = origin.publication_number
        AND keep_list.keep IS TRUE
    """
    _get_job_done(filter_query, destination_table, credentials)

get_stratified_sample(src_table, bin_size=50, preview=False, destination_table=None, credentials=None)

Return a stratified sample of src_table (based on country_code and publication_decade) with bin_size samples in each bin (if possible).

Parameters:

Name Type Description Default
src_table str

source table (project.dataset.table)

required
bin_size int

bin size

50
preview bool

if True, output not saved and table stats to stdout. Else, output saved to destination_table

False
destination_table str

destination table (project.dataset.table)

None
credentials str

BQ credentials file path

None

Usage:

patentcity io get-stratified-sample patentcity.patentcity.v1

Source code in patentcity/io.py
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
@app.command()
def get_stratified_sample(
    src_table: str,
    bin_size: int = 50,
    preview: bool = False,
    destination_table: str = None,
    credentials: str = None,
) -> None:
    """Return a stratified sample of `src_table` (based on country_code and publication_decade) with `bin_size` samples
    in each bin (if possible).

    Arguments:
        src_table: source table (project.dataset.table)
        bin_size: bin size
        preview: if True, output not saved and table stats to stdout. Else, output saved to `destination_table`
        destination_table: destination table (project.dataset.table)
        credentials: BQ credentials file path

    **Usage:**
        ```shell
        patentcity io get-stratified-sample patentcity.patentcity.v1
        ```

    !!! tip
        [Stratified random sampling with bigquery - StackOverflow](https://stackoverflow.com/questions/52901451/stratified-random-sampling-with-bigquery)

    """
    # In preview mode the sampled rows are wrapped in an aggregation that
    # reports per-bin sample counts and percentages instead of the rows
    # themselves.
    if preview:
        prefix = """
        SELECT COUNT(*) nb_samples, country_code, publication_decade, ROUND(100*COUNT(*)/MAX(nb_bin),2) AS percentage
        FROM (
        """
        select = (
            """SELECT publication_number, publication_decade, country_code, nb_bin"""
        )
        suffix = """) GROUP BY country_code, publication_decade"""
    else:
        prefix, select, suffix = "", "SELECT * ", ""

    # `tmp` unnests patentees with usable locations; `table_stats` counts rows
    # per (country_code, publication_decade) bin; RAND() < bin_size/nb_bin
    # yields ~bin_size samples per bin in expectation.
    query = f"""
    WITH tmp AS (
      SELECT CAST(publication_date/100000 AS INT64) AS publication_decade,
      * EXCEPT(patentee)
      FROM `{src_table}`,  # patentcity.patentcity.wgp_v1
            UNNEST(patentee) as patentee
        WHERE
        patentee.loc_text IS NOT NULL
        AND patentee.loc_source IS NOT NULL ),
      table_stats AS (
  SELECT *, SUM(nb_bin) OVER() AS nb_total
      FROM (
        SELECT
            country_code,
            CAST(publication_date/100000 AS INT64) AS publication_decade,
            COUNT(*) nb_bin
        FROM tmp
        GROUP BY country_code, publication_decade)
    )
    {prefix}
      {select}
      FROM tmp
      JOIN table_stats
      USING(country_code, publication_decade)
      WHERE RAND()< {bin_size}/nb_bin
    {suffix}
    """
    if preview:
        client = _get_bq_client(credentials)
        tmp = (
            client.query(query)
            .to_dataframe()
            .sort_values(by=["country_code", "publication_decade"])
        )
        typer.echo(tmp.to_markdown(index=False))
        typer.secho(f"Nb samples: {tmp['nb_samples'].sum()}", fg=typer.colors.BLUE)
    else:
        _get_job_done(query, destination_table, credentials)

get_wgp25_recid(country_code, src_table, patstat_patent_properties_table, destination_table, credentials)

Extract recId and searchText from wgp25 for patents published in country_code.

Parameters:

Name Type Description Default
country_code str

country code of the patent office (e.g. DE, FR, GB, US, etc)

required
src_table str

source table (project.dataset.table)

required
patstat_patent_properties_table str

PATSTAT patent properties table on BQ (project.dataset.table)

required
destination_table str

destination table (project.dataset.table)

required
credentials str

BQ credentials file path

required

Usage:

OFFICE=DE
patentcity io get-wgp25-recid $OFFICE patentcity.external.inventor_applicant_recid patentcity.external.patstat_patent_properties patentcity.tmp.loc_${(L)OFFICE}patentwgp25 credentials-patentcity.json

Info

This function assumes that the recId has been added to inventor_applicant_locationid beforehand (using utils.get_recid(address_)).

Source code in patentcity/io.py
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
@app.command()
def get_wgp25_recid(
    country_code: str,
    src_table: str,
    patstat_patent_properties_table: str,
    destination_table: str,
    credentials: str,
) -> None:
    """Extract recId and searchText from wgp25 for patents published in `country_code`.

    Arguments:
        country_code: country code of the patent office (e.g. DE, FR, GB, US, etc)
        src_table: source table (project.dataset.table)
        patstat_patent_properties_table: PATSTAT patent properties table on BQ (project.dataset.table)
        destination_table: destination table (project.dataset.table)
        credentials: BQ credentials file path

    **Usage:**
        ```shell
        OFFICE=DE
        patentcity io get-wgp25-recid $OFFICE patentcity.external.inventor_applicant_recid patentcity.external.patstat_patent_properties patentcity.tmp.loc_${(L)OFFICE}patentwgp25 credentials-patentcity.json
        ```

    !!! info
        This function assumes that the recId has been added to inventor_applicant_locationid beforehand (using `utils.get_recid(address_)`).
    """
    assert len(country_code) == 2
    # Fixes vs previous revision: the `tmp` CTE was never closed (missing
    # parenthesis before the final SELECT) and the office filter used
    # `IN "<code>"`, which is invalid BigQuery syntax for a single scalar —
    # an equality comparison is used instead.
    query = f"""
    WITH
      tmp AS (
      SELECT
        loc.*,
        patstat.*,
        SPLIT(patstat.publication_number, "-")[OFFSET(0)] AS country_code
      FROM
        `{src_table}` AS loc,  # patentcity.external.inventor_applicant_recid
        `{patstat_patent_properties_table}` AS patstat  # patentcity.external.patstat_patent_properties
      WHERE
        loc.appln_id = patstat.appln_id
        AND loc.appln_id IS NOT NULL
        AND SPLIT(patstat.publication_number, "-")[OFFSET(0)] = "{country_code}" )
    SELECT
      recId,
      ANY_VALUE(address_) AS searchText
    FROM
      tmp
    GROUP BY
      recId
    """
    _get_job_done(query, destination_table, credentials)

impute_publication_date(src_table, imputation_table, country_code=None, credentials=None)

Update src_table publication_date - DE & DD only

Parameters:

Name Type Description Default
src_table str

source table (project.dataset.table)

required
imputation_table str

imputation table (project.dataset.table)

required
country_code str

in ["DE", "DD"]

None
credentials str

BQ credentials file path

None

Usage:

patentcity io impute-publication-date <src_table> <imputation_table> --country-code DE --credentials credentials-patentcity.json

Source code in patentcity/io.py
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
@app.command()
def impute_publication_date(
    src_table: str,
    imputation_table: str,
    country_code: str = None,
    credentials: str = None,
) -> None:
    """Update `src_table` publication_date - DE & DD only

    Runs an in-place BigQuery UPDATE: rows of `src_table` matching
    `country_code` get their publication_date overwritten with the value
    found in `imputation_table` (matched on pubnum).

    Arguments:
        src_table: source table (project.dataset.table)
        imputation_table: imputation table (project.dataset.table)
        country_code: in ["DE", "DD"]
        credentials: BQ credentials file path

    **Usage:**
        ```shell
        patentcity io impute-publication-date <src_table> <imputation_table> --country-code DE --credentials credentials-patentcity.json
        ```
    """
    # For DE, restrict the imputation to pubnums below 330000.
    if country_code == "DE":
        de_clause = "AND CAST(imputation.pubnum AS INT64)<330000"
    else:
        de_clause = ""
    update_query = f"""UPDATE
      `{src_table}` AS t
    SET
      t.publication_date = imputation.publication_date
    FROM
      `{imputation_table}` AS imputation
    WHERE
      t.pubnum = imputation.pubnum
      AND country_code="{country_code}"
      {de_clause}

    """
    bq_client = _get_bq_client(credentials)
    typer.secho(f"Start:\n{update_query}", fg=typer.colors.BLUE)
    bq_client.query(update_query).result()

    typer.secho(f"{ok}{src_table} updated.", fg=typer.colors.GREEN)

order(src_table, by=None, destination_table=None, credentials=None)

Order src_table by by and stage it onto destination_table

Parameters:

Name Type Description Default
src_table str

source table (project.dataset.table)

required
by str

ordering dimension (e.g. publication_number)

None
destination_table str

destination table (project.dataset.table)

None
credentials str

BQ credentials file path

None

Usage:

patentcity io order patentcity.tmp.patentcity25 --by publication_number --destination-table patentcity.tmp.tmp25 --key-file credentials-patentcity.json

Source code in patentcity/io.py
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
@app.command()
def order(
    src_table: str,
    by: str = None,
    destination_table: str = None,
    credentials: str = None,
) -> None:
    """Stage `src_table` onto `destination_table`, ordered by `by`.

    Arguments:
        src_table: source table (project.dataset.table)
        by: ordering dimension (e.g. publication_number)
        destination_table: destination table (project.dataset.table)
        credentials: BQ credentials file path

    **Usage:**
        ```shell
        patentcity io order patentcity.tmp.patentcity25 --by publication_number --destination-table patentcity.tmp.tmp25 --key-file credentials-patentcity.json
        ```
    """
    order_query = f"""
    SELECT
      *
    FROM
      `{src_table}`
    ORDER BY
      {by}  # publication_number
    """
    _get_job_done(order_query, destination_table, credentials)

prep_csv_extract(src_table, destination_table, credentials)

Return the patentcity table as an unstructured table for CSV extract

Parameters:

Name Type Description Default
src_table str

source table (project.dataset.table)

required
destination_table str

destination table (project.dataset.table)

required
credentials str

BQ credentials file path

required

Usage:

RELEASE="v100rc5"
patentcity io prep-csv-extract patentcity.patentcity.${RELEASE} patentcity.stage.${RELEASE} <your-credentials.json>

Source code in patentcity/io.py
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
@app.command()
def prep_csv_extract(src_table: str, destination_table: str, credentials: str) -> None:
    """Flatten the patentcity table for CSV extraction.

    Unnests the `patentee` repeated field so that each output row pairs the
    patent-level columns with a single patentee record, giving an
    unstructured table suitable for a CSV export.

    Arguments:
        src_table: source table (project.dataset.table)
        destination_table: destination table (project.dataset.table)
        credentials: BQ credentials file path

    **Usage**:
        ```shell
        RELEASE="v100rc5"
        patentcity io prep-csv-extract patentcity.patentcity.${RELEASE} patentcity.stage.${RELEASE} <your-credentials.json>
        ```
    """
    flatten_query = f"""
    SELECT
      p.* EXCEPT(patentee),
      patentee.*
    FROM
      `{src_table}` as p,
      UNNEST(patentee) AS patentee
    """
    _get_job_done(flatten_query, destination_table, credentials)