Skip to content

geo

                        Patentcity geo

General principle: address (str) -> structured geo data (dict). 3 flavors: libpostal, HERE, GMAPS.

libpostal (parser)

Libpostal https://github.com/openvenues/libpostal Docker libpostal https://github.com/johnlonganecker/libpostal-rest-docker REST api https://github.com/johnlonganecker/libpostal-rest

Note: i) if set up on GCP, you need to set up firewall rules to authorize access from the requesting machine; ii) get the external IP of the GCP compute engine at https://console.cloud.google.com/networking/addresses/list?project=

HERE Batch (geocoding)

Guide: developer.here.com/documentation/batch-geocoder/dev_guide/topics/request-constructing.html API ref: https://developer.here.com/documentation/batch-geocoder/dev_guide/topics/endpoints.html

Gmaps (geocoding)

API ref

  • https://developers.google.com/maps/documentation/geocoding/start
  • https://developers.google.com/maps/documentation/geocoding/overview

add_geoc_data(file, geoc_file, source=None, max_workers=5, verbose=False)

Add geoc data from geoc_file to file

Parameters:

Name Type Description Default
file str

file path

required
geoc_file str

geoc file path (geocoding output, csv)

required
source str

geocoding service (in ["HERE", "GMAPS", "MANUAL"])

None
max_workers int

max number of workers

5
verbose bool

verbosity

False

Usage:

patentcity geo add entrel_uspatentxx.jsonl geoc_uspatentxx.here.csv --source HERE

Source code in patentcity/geo.py
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
@app.command(name="add")
def add_geoc_data(
    file: str,
    geoc_file: str,
    source: str = None,
    max_workers: int = 5,
    verbose: bool = False,
):
    """Add geoc data from `geoc_file` to `file`

    Each line of `file` is handed to `_update_loc` together with the geocoding
    index built from `geoc_file`; lines are processed by a thread pool.

    Arguments:
        file: file path
        geoc_file: geoc file path (geocoding output, csv)
        source: geocoding service (in ["HERE", "GMAPS", "MANUAL"])
        max_workers: max number of workers
        verbose: verbosity

    **Usage:**
        ```shell
        patentcity geo add entrel_uspatentxx.jsonl geoc_uspatentxx.here.csv --source HERE
        ```
    """
    # The default `source=None` does not pass this check, so `--source` is
    # effectively mandatory.
    assert source in ["GMAPS", "HERE", "MANUAL"]
    index = _get_geoc_index(geoc_file, dump=False)
    # Open the input inside a context manager so the handle is released once the
    # pool has finished (the executor block exits before the file block does).
    with open(file, "r") as blobs:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # NOTE(review): the iterator returned by `executor.map` is not consumed,
            # so exceptions raised inside workers are not surfaced here - confirm
            # that this is intended.
            executor.map(
                _update_loc, blobs, repeat(source), repeat(index), repeat(verbose)
            )

add_geoc_disamb(disamb_file, index_geoc_file, flavor='GMAPS', inDelim='|')

Return a list of recId|geoc(target) from a list of recid|target.

Parameters:

Name Type Description Default
disamb_file str

disambiguation data file path

required
index_geoc_file str

index geocoding file path

required
flavor str

flavor of index_geoc_file (in ["HERE","GMAPS"])

'GMAPS'
inDelim str

inner delimiter

'|'

Usage:

patentcity geo add.disamb ${DISAMBFILE} ${GEOCINDEX} --flavor ${FLAVOR}

Info

Use before patentcity geo add

Source code in patentcity/geo.py
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
@app.command(name="add.disamb")
def add_geoc_disamb(
    disamb_file: str, index_geoc_file: str, flavor: str = "GMAPS", inDelim: str = "|"
):
    """Return a list of recId|geoc(target) from a list of recid|target.

    With the "GMAPS" flavor, each output line is `recid<inDelim><json>` where the
    json is the indexed geocoding of the disambiguated target (`null` when the
    target is not in the index). With the "HERE" flavor, a csv with the
    `GEOC_OUTCOLS` columns is written to stdout; a target missing from the index
    yields a row carrying only its `recId`.

    Arguments:
        disamb_file: disambiguation data file path
        index_geoc_file: index geocoding file path
        flavor: flavor of `index_geoc_file` (in ["HERE","GMAPS"])
        inDelim: inner delimiter

    **Usage:**
        ```shell
        patentcity geo add.disamb ${DISAMBFILE} ${GEOCINDEX} --flavor ${FLAVOR}
        ```

    !!! info
        Use before `patentcity geo add`
    """
    assert flavor in ["GMAPS", "HERE"]
    if flavor == "GMAPS":
        index = {}
        with open(index_geoc_file, "r") as lines:
            for line in lines:
                # Split on the first delimiter only: the JSON payload may itself
                # contain `inDelim`, which would otherwise break the unpacking.
                recid, geoc = line.split(inDelim, 1)
                index[recid] = json.loads(geoc)

        with open(disamb_file, "r") as lines:
            for line in lines:
                recid, disamb_loc = line.split(inDelim, 1)
                disamb_loc_recid = get_recid(clean_text(disamb_loc))
                typer.echo(f"{recid}{inDelim}{json.dumps(index.get(disamb_loc_recid))}")
    else:
        index = _get_geoc_index(index_geoc_file, dump=False)
        writer = csv.DictWriter(sys.stdout, fieldnames=GEOC_OUTCOLS)
        writer.writeheader()
        with open(disamb_file, "r") as lines:
            for line in lines:
                recid, searchtext = line.replace("\n", "").split(inDelim, 1)
                # Work on a copy so the shared index entry is not mutated, and fall
                # back to an empty record when the search text is not indexed
                # (calling `.update` on the missing entry used to raise).
                geoc_disamb = dict(index.get(get_recid(searchtext)) or {})
                geoc_disamb["recId"] = recid
                writer.writerow(geoc_disamb)

add_statisticalareas(file, statisticalareas_path, verbose=False)

Return file with statistical areas to stdout.

Parameters:

Name Type Description Default
file str

file path

required
statisticalareas_path str

statistical area files path (wildcard allowed)

required
verbose bool

verbosity

False

Usage:

patentcity geo add.statisticalareas geoc_gbpatentxx.here.csv "assets/statisticalareas_*.csv"

Source code in patentcity/geo.py
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
@app.command(name="add.statisticalareas")
def add_statisticalareas(file: str, statisticalareas_path: str, verbose: bool = False):
    """Return `file` with statistical areas to stdout.

    The geocoded csv is left-joined with the statistical areas tables on
    `country` and a `key` derived from the address components of each row.

    Arguments:
         file: file path
         statisticalareas_path: statistical area files path (wildcard allowed)
         verbose: verbosity

    **Usage:**
        ```shell
        patentcity geo add.statisticalareas geoc_gbpatentxx.here.csv "assets/statisticalareas_*.csv"
        ```
    """
    statisticalareas_df = read_csv_many(
        statisticalareas_path, verbose=verbose, dtype=str
    )
    # NOTE(review): `error_bad_lines` is deprecated in recent pandas releases in
    # favour of `on_bad_lines` - confirm against the pinned pandas version.
    geoc_df = pd.read_csv(file, dtype=str, error_bad_lines=False)
    geoc_df = geoc_df.where(pd.notnull(geoc_df), None)  # we replace pandas nan by None
    variables = ["country", "state", "county", "city", "postalCode"]
    geoc_df["key"] = geoc_df[variables].apply(get_statisticalarea_key, axis=1)
    geoc_df = geoc_df.merge(statisticalareas_df, how="left", on=["country", "key"])
    # `to_csv` writes straight to the given buffer and returns None; echoing that
    # return value only appended a stray blank line after the csv.
    geoc_df.to_csv(sys.stdout, index=False)

get_geoc_data_gmaps(file, api_key, region, language='en', max_workers=5, inDelim='|', skip_header=True)

Geocode addresses in file using GMAPS

Parameters:

Name Type Description Default
file str

file path

required
api_key str

api key

required
region str

region code, specified as a ccTLD (“top-level domain”) two-character value (e.g. de, fr, uk, us, etc).

required
language str

the language in which to return results

'en'
max_workers int

max number of workers

5
inDelim str

inner delimiter

'|'
skip_header bool

whether to skip the header or not

True

Usage:

patentcity geo gmaps.get loc_uspatentxx.txt $APIKEY us

Source code in patentcity/geo.py
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
@app.command(name="gmaps.get")
def get_geoc_data_gmaps(
    file: str,
    api_key: str,
    region: str,
    language: str = "en",
    max_workers: int = 5,
    inDelim: str = "|",
    skip_header: bool = True,
):
    """Geocode addresses in `file` using GMAPS

    Every line of `file` (minus the header when `skip_header` is set) is passed
    to `_get_geoc_data_gmaps` by a pool of worker threads.

    Arguments:
        file: file path
        api_key: api key
        region:  region code, specified as a ccTLD (“top-level domain”) two-character value (e.g. de, fr, uk, us, etc).
        language: the language in which to return results
        max_workers: max number of workers
        inDelim: inner delimiter
        skip_header: whether to skip the header or not

    **Usage:**
        ```shell
        patentcity geo gmaps.get loc_uspatentxx.txt $APIKEY us
        ```

    !!! info
        - [Quickstart](https://developers.google.com/maps/documentation/geocoding/start)
        - [Overview](https://developers.google.com/maps/documentation/geocoding/overview)
        - [Language](https://developers.google.com/maps/faq#languagesupport)
    """
    gmaps = googlemaps.Client(api_key)
    with open(file, "r") as lines:
        if skip_header:
            # A default avoids StopIteration when the file is empty.
            next(lines, None)
        with ThreadPoolExecutor(max_workers) as executor:
            executor.map(
                _get_geoc_data_gmaps,
                lines,
                repeat(gmaps),
                repeat(region),
                repeat(language),
                repeat(inDelim),
            )

get_geoc_data_here(request_id, api_key, output_dir=None, unzip=True)

Download and save HERE geocoded data to output_dir/request_id.zip

Parameters:

Name Type Description Default
request_id str

HERE job request ID (returned by `here.post`)

required
api_key str

HERE api key

required
output_dir str

saving directory

None
unzip bool

whether to unzip the output

True

Usage:

patentcity geo here.get $REQUESTID $APIKEY --output-dir <your-dir>

Source code in patentcity/geo.py
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
@app.command(name="here.get")
def get_geoc_data_here(
    request_id: str, api_key: str, output_dir: str = None, unzip: bool = True
):
    """Download and save HERE geocoded data to `output_dir`/`request_id`.zip

    Arguments:
        request_id: HERE job request ID (returned by `here.post`)
        api_key: HERE api key
        output_dir: saving directory (the current directory when not given)
        unzip: whether to unzip the output

    **Usage:**
        ```shell
        patentcity geo here.get $REQUESTID $APIKEY --output-dir <your-dir>
        ```

    !!! info
        - [Read output](https://developer.here.com/documentation/batch-geocoder/dev_guide/topics/read-batch-request-output.html)
    """

    def dump_data(response, output_file):
        """Write the raw response body to `output_file`."""
        with open(output_file, "wb") as fout:
            fout.write(response.content)
            typer.secho(f"{ok}{output_file}", fg=typer.colors.GREEN)

    def unzip_data(zip_file):
        """Extract `zip_file` into a sibling directory named after the archive."""
        unzip_dir = os.path.splitext(zip_file)[0]
        with ZipFile(zip_file, "r") as zipObj:
            # Extract all the contents of zip file in different directory
            zipObj.extractall(unzip_dir)
            typer.secho(f"{ok}{zip_file} unzipped", fg=typer.colors.GREEN)

    # `output_dir` defaults to None, which `os.path.join` rejects with a TypeError;
    # fall back to the current directory in that case.
    output_file = os.path.join(output_dir or os.curdir, f"{request_id}.zip")
    params = (("apiKey", api_key),)
    response = requests.get(
        f"https://batch.geocoder.ls.hereapi.com/6.2/jobs/{request_id}/result/",
        params=params,
    )
    if response.status_code == 200:
        dump_data(response, output_file)
        if unzip:
            unzip_data(output_file)
    else:
        typer.secho(
            f"{not_ok}Failed with status {response.status_code}\n{response.content}",
            fg=typer.colors.RED,
        )

get_geoc_status_here(request_id, api_key, freq=5, verbose=False)

Check status of job request_id every freq seconds

Parameters:

Name Type Description Default
request_id str

HERE job request ID (returned by `here.post`)

required
api_key str

HERE api key

required
freq int

interval between 2 consecutive status updates

5
verbose bool

verbosity

False

Usage:

patentcity geo here.status $REQUESTID $APIKEY

Source code in patentcity/geo.py
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
@app.command(name="here.status")
def get_geoc_status_here(
    request_id: str, api_key: str, freq: int = 5, verbose: bool = False
):
    """Poll the status of job `request_id` every `freq` seconds until completion

    Arguments:
        request_id: HERE job request ID (returned by `here.post`)
        api_key: HERE api key
        freq: number of seconds to wait between 2 consecutive status requests
        verbose: when set, the full (prettified) XML response is printed as well

    **Usage:**
        ```shell
        patentcity geo here.status $REQUESTID $APIKEY
        ```
    """

    def report(resp, detailed):
        """Print a progress summary for one status response; return the job status."""
        parsed = BeautifulSoup(resp.text, "xml")
        timestamp = get_dt_human()
        job_status = parsed.Status.text
        total = parsed.TotalCount.text
        processed = parsed.ProcessedCount.text
        pending = parsed.PendingCount.text
        n_errors = parsed.ErrorCount.text
        n_success = parsed.SuccessCount.text

        typer.secho(
            f"{timestamp}: {processed}/{total} ({pending} pending)",
            fg=typer.colors.BLUE,
        )
        if int(n_success) > 0:
            typer.secho(
                f"{ok}{n_success} addresses successfully geocoded",
                fg=typer.colors.GREEN,
            )
        if int(n_errors) > 0:
            typer.secho(f"{not_ok}{n_errors} errors detected", fg=typer.colors.RED)
        if detailed:
            typer.echo(parsed.prettify())
        return job_status

    query = (("action", "status"), ("apiKey", api_key))
    # NOTE(review): the loop only stops on the "completed" status; any other
    # status keeps polling - confirm how non-completing jobs should be handled.
    while True:
        resp = requests.get(f"{GEOC_URL}/{request_id}", params=query)
        job_status = report(resp, verbose)
        if job_status == "completed":
            typer.secho(f"{ok}{ok}Job completed", fg=typer.colors.GREEN)
            break
        typer.secho(f"Status:{job_status}", fg=typer.colors.BLUE)
        time.sleep(freq)

get_parsed_loc_libpostal(path, api_reference, max_workers=10, debug=False)

Send data in path to libpostal service (hosted at api_reference) and return parsed loc json blobs to stdout.

Parameters:

Name Type Description Default
path str

data path (wildcard allowed)

required
api_reference str

reference of service host "ip:port"

required
max_workers int

max number of workers

10
debug bool

verbosity degree

False

Usage:

patentcity geo libpostal.get <your-addresses.txt> <ip:port>

Source code in patentcity/geo.py
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
@app.command(deprecated=True, name="libpostal.get")
def get_parsed_loc_libpostal(
    path: str, api_reference: str, max_workers: int = 10, debug: bool = False
):
    """
    Send data in `path` to libpostal service (hosted at `api_reference`)
    and return parsed loc json blobs to stdout.

    Arguments:
        path: data path (wildcard allowed)
        api_reference: reference of service host "ip:port"
        max_workers: max number of workers
        debug: verbosity degree

    **Usage:**
        ```shell
        patentcity geo libpostal.get <your-addresses.txt> <ip:port>
        ```
    """
    for file in glob(path):
        # Each matched file is closed as soon as its lines have been processed
        # (the executor block waits for the workers before the file block exits).
        with open(file, "r") as data:
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                executor.map(
                    _parse_loc_blob, data, repeat(api_reference), repeat(debug)
                )

harmonize_geoc_data_gmaps(file, inDelim='|', out_format='csv', header=True)

Harmonize Gmaps response with HERE Geocoding API responses (csv)

Parameters:

Name Type Description Default
file str

file path

required
inDelim str

inner delimiter

'|'
out_format str

format of the output (in ["csv", "jsonl"])

'csv'
header bool

whether to add a header (if out_format is "csv")

True

Usage:

patentcity geo gmaps.harmonize geoc_uspatentxx.gmaps.jsonl

Source code in patentcity/geo.py
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
@app.command(name="gmaps.harmonize")
def harmonize_geoc_data_gmaps(
    file: str, inDelim: str = "|", out_format: str = "csv", header: bool = True
):
    """Harmonize Gmaps response with HERE Geocoding API responses (csv)

    Each line of `file` is cleaned, split into a record id and a response, and
    handed to `_parse_response_gmaps`. Lines raising a `ValueError` (during the
    split or the parsing) are skipped silently.

    Arguments:
        file: file path
        inDelim: inner delimiter
        out_format: format of the output (in ["csv", "jsonl"])
        header: whether to add a header (if `out_format` is "csv")

    **Usage:**
        ```shell
        patentcity geo gmaps.harmonize geoc_uspatentxx.gmaps.jsonl
        ```
    """

    assert out_format in ["csv", "jsonl"]
    iso = get_isocrossover()
    us_states = get_usstatecrossover()
    counties = get_countycrossover()

    if out_format == "csv" and header:
        csv.DictWriter(sys.stdout, GEOC_OUTCOLS).writeheader()

    with open(file, "r") as fin:
        for raw_line in fin:
            # clean cases like "Jack A. Claes Pavilion | Elk Grove Park District" returned by Gmaps
            cleaned = clean_text(raw_line, inDelim=f" {inDelim} ")
            try:
                recid, response = cleaned.split(inDelim)
                _parse_response_gmaps(
                    response, recid, out_format, iso, us_states, counties
                )
            except ValueError:
                # Best effort: lines that cannot be split or parsed are dropped.
                continue

post_geoc_data_here(file, api_key, countryfocus, outCols=None, inDelim='|', outDelim=',', locationattributes='addressDetails', language='en-EN', includeinputfields=False, verbose=False)

Post file to HERE batch geocoding API

Parameters:

Name Type Description Default
file str

file path. File is expected to be formatted as follows recId|searchText

required
api_key str

HERE api key

required
countryfocus str

iso3 country code (e.g. deu, fra, gbr, usa, etc), see Format input

required
outCols str

see Request parameters

None
inDelim str

see Request parameters

'|'
outDelim str

see Request parameters

','
locationattributes str

see Request parameters

'addressDetails'
language str

output language, see Request parameters

'en-EN'
includeinputfields bool

see Request parameters

False
verbose bool

verbosity

False

Usage:

patentcity geo here.post loc_uspatentxx.txt $APIKEY usa

Source code in patentcity/geo.py
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
@app.command(name="here.post")
def post_geoc_data_here(
    file: str,
    api_key: str,
    countryfocus: str,  # ISO3?
    outCols: str = None,  # pylint: disable=invalid-name
    inDelim: str = "|",  # pylint: disable=invalid-name
    outDelim: str = ",",  # pylint: disable=invalid-name
    locationattributes: str = "addressDetails",
    language: str = "en-EN",  # eg "en-EN", "en-US"
    includeinputfields: bool = False,  # False for downstream compatibility
    verbose: bool = False,
):
    """Post `file` to HERE batch geocoding API

    Arguments:
        file: file path. File is expected to be formatted as follows recId|searchText
        api_key: HERE api key
        countryfocus: iso3 country code (e.g. deu, fra, gbr, usa, etc), see [Format input](https://developer.here.com/documentation/batch-geocoder/dev_guide/topics/data-input.html)
        outCols: comma-separated output columns (defaults to `GEOC_OUTCOLS`), see [Request parameters](https://developer.here.com/documentation/batch-geocoder/dev_guide/topics/request-parameters.html)
        inDelim: see [Request parameters](https://developer.here.com/documentation/batch-geocoder/dev_guide/topics/request-parameters.html)
        outDelim: see [Request parameters](https://developer.here.com/documentation/batch-geocoder/dev_guide/topics/request-parameters.html)
        locationattributes: see [Request parameters](https://developer.here.com/documentation/batch-geocoder/dev_guide/topics/request-parameters.html)
        language: output language, see [Request parameters](https://developer.here.com/documentation/batch-geocoder/dev_guide/topics/request-parameters.html)
        includeinputfields: see [Request parameters](https://developer.here.com/documentation/batch-geocoder/dev_guide/topics/request-parameters.html)
        verbose: verbosity

    **Usage:**
        ```shell
        patentcity geo here.post loc_uspatentxx.txt $APIKEY usa
        ```

    !!! info
        - [Format input](https://developer.here.com/documentation/batch-geocoder/dev_guide/topics/data-input.html)
        - [Request parameters](https://developer.here.com/documentation/batch-geocoder/dev_guide/topics/request-parameters.html)
    """

    def check_post(response):
        """Log the request id and status returned by the API for `file`."""
        soup = BeautifulSoup(response.text, features="xml")
        RequestId = soup.RequestId.text
        Status = soup.Status.text
        log_msg = f"{file}\t{Status}\t{RequestId}\t{get_dt_human()}"
        if verbose:
            typer.echo(soup.prettify())
        if Status == "accepted":
            typer.secho(f"{ok}{log_msg}", fg=typer.colors.GREEN)
        else:
            typer.secho(f"{not_ok}\t{log_msg}", fg=typer.colors.RED)

    headers = {"Content-Type": "text/plain"}
    # Copy the default so the removals below never alter the module-level
    # GEOC_OUTCOLS list shared with the other commands.
    outCols = outCols.split(",") if outCols else list(GEOC_OUTCOLS)

    # Remove default columns to avoid duplicated columns
    # NOTE(review): "recID" differs in case from the "recId" spelling used
    # elsewhere in this module - confirm which one the column list contains.
    for col in ["recID", "seqNumber", "seqLength"]:
        try:
            outCols.remove(col)
        except ValueError:
            # Column not requested: nothing to remove.
            pass

    params = (
        ("apiKey", api_key),
        ("action", "run"),
        ("header", "true"),
        ("inDelim", inDelim),
        ("outDelim", outDelim),
        ("outCols", ",".join(outCols)),
        ("outputcombined", "true"),
        ("countryfocus", countryfocus),
        ("language", language),
        ("locationattributes", locationattributes),
        ("includeinputfields", includeinputfields),
    )

    # Read the payload through a context manager so the handle is closed.
    with open(file, "rb") as fin:
        data = fin.read()
    response = requests.post(GEOC_URL, headers=headers, params=params, data=data)
    if response.status_code == 200:
        check_post(response)
    else:
        typer.secho(
            f"{not_ok}Failed with status {response.status_code}\n{response.content}",
            fg=typer.colors.RED,
        )

prep_geoc_data(file, inDelim='|')

Return patentees' loc data formatted for geocoding to stdout (recId|searchText).

Parameters:

Name Type Description Default
file str

file path

required
inDelim str

inner delimiter used by HERE

'|'

Usage:

patentcity geo prep entrel_uspatent01.jsonl
#Sort and deduplicate addresses before batch geocoding
sort -u loc_uspatent01.txt

Source code in patentcity/geo.py
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
@app.command(name="prep")
def prep_geoc_data(file: str, inDelim: str = "|"):
    """Return patentees' loc data formatted for geocoding to stdout (recId|searchText).

    `file` is read as one json object per line. For every patentee having both a
    `loc_recId` and a `loc_text`, one `loc_recId<inDelim>loc_text` line is
    emitted. Records without a `patentee` entry contribute no line.

    Arguments:
        file: file path
        inDelim: inner delimiter used by HERE

    **Usage:**
        ```shell
        patentcity geo prep entrel_uspatent01.jsonl
        #Sort and deduplicate addresses before batch geocoding
        sort -u loc_uspatent01.txt
        ```
    """
    with open(file, "r") as lines:
        typer.echo(f"recId{inDelim}searchText")  # This is the required header
        for line in lines:
            line = json.loads(line)
            # `.get` returns None when the key is absent (or null); iterate over an
            # empty list instead of raising a TypeError in that case.
            patentees = line.get("patentee") or []
            for patentee in patentees:
                loc_recid = patentee.get("loc_recId")
                loc_text = patentee.get("loc_text")
                if loc_recid and loc_text:
                    typer.echo(f"{loc_recid}{inDelim}{loc_text}")