Data extraction

How to successfully extract data from documents

Before extracting data you need to:

Once you have successfully completed the above steps, you can use the /extract-data endpoint to extract data. We have prepared sample code to help you out:

1 Open file as base64 string (Lines 4-6)

Open the file in binary mode and encode its contents as a base64 string. Make sure that your file is in the same directory as the script.

2 Create payload (Lines 8-12)

Create request payload with all the required parameters:

file
file_name
document_type_name

3 Specify headers (Lines 16-20)

Make sure that the Content-Type is set to application/json.

4 Authorize with your API key (Line 19)

You can get your API key at https://app.typless.com/settings/profile

5 Execute the request (Line 22)

Send the request and wait for the response.

import requests
import base64

file_name = 'name_of_your_document.pdf'  # path of the document to send; also sent as "file_name" in the payload
with open(file_name, 'rb') as file:  # binary mode, so the raw bytes can be base64-encoded
    base64_data = base64.b64encode(file.read()).decode('utf-8')  # bytes -> base64 bytes -> str for the JSON body

payload = {
    "file": base64_data,
    "file_name": file_name,
    "document_type_name": "line-item-invoice"  # name of the document type used for extraction
}

url = "https://developers.typless.com/api/extract-data"

headers = {
    "Accept": "application/json",
    "Content-Type": "application/json",
    "Authorization": "<<apiKey>>"  # placeholder: replace with your API key
}

response = requests.request("POST", url, json=payload, headers=headers)  # json= sends the payload as the JSON request body

print(response.json())

const fetch = require('node-fetch');
const fs = require('fs');

// Read the document and get its contents as a base64 string in one step.
const fileName = 'name_of_your_document.pdf';
const base64File = fs.readFileSync(fileName, {encoding: 'base64'});

// Same extract-data endpoint that the Python sample posts to.
const url = 'https://developers.typless.com/api/extract-data';

// Required request parameters: file, file_name and document_type_name.
const payload = {
  file: base64File,
  file_name: fileName,
  document_type_name: "line-item-invoice"
};

const headers = {
  'Accept': 'application/json',
  'Content-Type': 'application/json',
  // Placeholder: replace with your API key.
  'Authorization': '<<apiKey>>'
};

const options = {
  method: 'POST',
  headers: headers,
  // The body must be a JSON string to match the Content-Type header.
  body: JSON.stringify(payload)
};

// Send the request, print the parsed JSON response, and report any failure.
fetch(url, options)
  .then(res => res.json())
  .then(json => console.log(JSON.stringify(json)))
  .catch(err => console.error('error:' + err));

Response:

{
    "file_name": "name_of_your_document.pdf",
    "object_id": "1cb25cc8-c9fa-4149-9a83-b4ed6a2173b9",
    "extracted_fields": [
        {
            "name": "supplier",
            "values": [
                {
                    "x": -1,
                    "y": -1,
                    "width": -1,
                    "height": -1,
                    "value": "ScaleGrid",
                    "confidence_score": "0.968",
                    "page_number": -1
                }
            ],
            "data_type": "AUTHOR"
        },
        {
            "name": "invoice_number",
            "values": [
                {
                    "x": 1989,
                    "y": 545,
                    "width": 323,
                    "height": 54,
                    "value": "20190500005890",
                    "confidence_score": "0.250",
                    "page_number": 0
                },
                {
                    "x": 167,
                    "y": 574,
                    "width": 391,
                    "height": 54,
                    "value": "GB123456789",
                    "confidence_score": "0.250",
                    "page_number": 0
                }
            ],
            "data_type": "STRING"
        },
        {
            "name": "issue_date",
            "values": [
                {
                    "x": 2072,
                    "y": 628,
                    "width": 240,
                    "height": 54,
                    "value": "2019-06-05",
                    "confidence_score": "0.358",
                    "page_number": 0
                }
            ],
            "data_type": "DATE"
        },
        {
            "name": "total_amount",
            "values": [
                {
                    "x": 2146,
                    "y": 1196,
                    "width": 126,
                    "height": 54,
                    "value": "47.5300",
                    "confidence_score": "0.990",
                    "page_number": 0
                }
            ],
            "data_type": "NUMBER"
        }
    ],
    "line_items": [
        [
            {
                "name": "Description",
                "values": [
                    {
                        "x": 208,
                        "y": 1196,
                        "width": 1022,
                        "height": 50,
                        "value": "5/2019-MongoBackend-MgmtStandalone-Small-744 hours",
                        "confidence_score": "0.661",
                        "page_number": 0
                    }
                ],
                "data_type": "STRING"
            },
            {
                "name": "Price",
                "values": [
                    {
                        "x": 2146,
                        "y": 1196,
                        "width": 126,
                        "height": 54,
                        "value": "47.5300",
                        "confidence_score": "0.582",
                        "page_number": 0
                    }
                ],
                "data_type": "NUMBER"
            },
            {
                "name": "Quantity",
                "values": [
                    {
                        "x": 1979,
                        "y": 1196,
                        "width": 23,
                        "height": 54,
                        "value": "1",
                        "confidence_score": "0.647",
                        "page_number": 0
                    }
                ],
                "data_type": "NUMBER"
            }
        ]
    ],
    "customer": null
}

Supported file types

Typless supports the following file types:

PDF
JPG
PNG
TIFF

If you are working with scanned documents, we recommend using a resolution of 300 DPI to achieve optimal results.

🚧 Are you having problems with document quality?

We wrote a short blog on how to solve your problems here.

Line item extraction

If you want to extract line items, make sure you have defined the line-item structure in the document type.

Request parameters

Param

Type

Required

Details

document_type_name

string

YES

Name of the document type that you use for extraction.

file

string (Base64 encoded)

YES

The original file of the document that you are extracting data from.

file_name

string

YES

Name of the original file of the document that you are extracting data from. Name must include file type suffix, e.g., document.pdf

customer

string

NO

Your internal customer identification, used for the .csv usage report, e.g., "my customer"

Understanding response

In the response, you will always find all of the fields you defined in the document type. Values for fields will be sorted by the confidence score. If the field is not present on the document, the value will be set to null.

Response base params

Param

Type

Details

file_name

string

Same value as provided in request

object_id

string

ID of the document, used when sending feedback to the dataset.

extracted_fields

list

List of extracted fields

customer

string

Same value as provided in request

Extracted fields params

Param

Type

Behaviour

name

string

Name of the field you defined in the document type

values

list

List of values for the field

data_type

string

Type of the field you defined in the document type

Extracted values params

Param

Type

Behaviour

x

int

X coordinate of the top-left bounding box corner. If value is null, this will be -1

y

int

Y coordinate of the top-left bounding box corner. If value is null, this will be -1

width

int

Bounding box width. If value is null this width will be -1

height

int

Bounding box height. If value is null this height will be -1

value

string

Value for field in standard format

confidence_score

string

Value between 0 and 1. The higher the value, the more confident the system is

page_number

int

Page on which value is present. If value is null this page number will be -1

📘 The supplier field will never carry any positional data - just the value!

All positional parameters (x, y, width, height, page_number) will always be -1; only value and confidence_score carry data.

Example full response

{
    "file_name": "invoice_2.pdf",
    "object_id": "1cb25cc8-c9fa-4149-9a83-b4ed6a2173b9",
    "extracted_fields": [
        {
            "name": "supplier",
            "values": [
                {
                    "x": -1,
                    "y": -1,
                    "width": -1,
                    "height": -1,
                    "value": "ScaleGrid",
                    "confidence_score": "0.968",
                    "page_number": -1
                }
            ],
            "data_type": "AUTHOR"
        },
        {
            "name": "invoice_number",
            "values": [
                {
                    "x": 1989,
                    "y": 545,
                    "width": 323,
                    "height": 54,
                    "value": "20190500005890",
                    "confidence_score": "0.250",
                    "page_number": 0
                },
                {
                    "x": 167,
                    "y": 574,
                    "width": 391,
                    "height": 54,
                    "value": "GB123456789",
                    "confidence_score": "0.250",
                    "page_number": 0
                }
            ],
            "data_type": "STRING"
        },
        {
            "name": "issue_date",
            "values": [
                {
                    "x": 2072,
                    "y": 628,
                    "width": 240,
                    "height": 54,
                    "value": "2019-06-05",
                    "confidence_score": "0.358",
                    "page_number": 0
                }
            ],
            "data_type": "DATE"
        },
        {
            "name": "total_amount",
            "values": [
                {
                    "x": 2146,
                    "y": 1196,
                    "width": 126,
                    "height": 54,
                    "value": "47.5300",
                    "confidence_score": "0.990",
                    "page_number": 0
                }
            ],
            "data_type": "NUMBER"
        }
    ],
    "line_items": [
        [
            {
                "name": "Description",
                "values": [
                    {
                        "x": 208,
                        "y": 1196,
                        "width": 1022,
                        "height": 50,
                        "value": "5/2019-MongoBackend-MgmtStandalone-Small-744 hours",
                        "confidence_score": "0.661",
                        "page_number": 0
                    }
                ],
                "data_type": "STRING"
            },
            {
                "name": "Price",
                "values": [
                    {
                        "x": 2146,
                        "y": 1196,
                        "width": 126,
                        "height": 54,
                        "value": "47.5300",
                        "confidence_score": "0.582",
                        "page_number": 0
                    }
                ],
                "data_type": "NUMBER"
            },
            {
                "name": "Quantity",
                "values": [
                    {
                        "x": 1979,
                        "y": 1196,
                        "width": 23,
                        "height": 54,
                        "value": "1",
                        "confidence_score": "0.647",
                        "page_number": 0
                    }
                ],
                "data_type": "NUMBER"
            }
        ]
    ],
    "customer": null
}

Example null response:

{
    "file_name": "invoice.pdf",
    "object_id": "26e01d82-e7f4-48d3-a902-b74283b73279",
    "extracted_fields": [
        {
            "name": "total_amount",
            "values": [
                {
                    "x": -1,
                    "y": -1,
                    "width": -1,
                    "height": -1,
                    "value": null,
                    "confidence_score": "0.000",
                    "page_number": -1
                }
            ],
            "data_type": "NUMBER"
        },
        {
            "name": "invoice_number",
            "values": [
                {
                    "x": -1,
                    "y": -1,
                    "width": -1,
                    "height": -1,
                    "value": null,
                    "confidence_score": "0.000",
                    "page_number": -1
                }
            ],
            "data_type": "STRING"
        },
        {
            "name": "issue_date",
            "values": [
                {
                    "x": -1,
                    "y": -1,
                    "width": -1,
                    "height": -1,
                    "value": null,
                    "confidence_score": "0.000",
                    "page_number": -1
                }
            ],
            "data_type": "DATE"
        },
        {
            "name": "supplier",
            "values": [
                {
                    "x": -1,
                    "y": -1,
                    "width": -1,
                    "height": -1,
                    "value": null,
                    "confidence_score": "0.000",
                    "page_number": -1
                }
            ],
            "data_type": "AUTHOR"
        }
    ],
    "line_items": [],
    "customer": null
}

PreviousManaging models NextAsynchronous extraction

Last updated 6 months ago

hashtagSupported file types

hashtagLine item extraction

hashtagRequest parameters

hashtagUnderstanding response

hashtagResponse base params

hashtagExtracted fields params

hashtagExtracted values params

hashtagExample full response

hashtagExample null response:

Supported file types

Line item extraction

Request parameters

Understanding response

Response base params

Extracted fields params

Extracted values params

Example full response

Example null response: