# Data extraction

Before extracting data you need to:

1. [Create a document type](https://typless.gitbook.io/typlessapi/typless/document-type)
2. [Build a dataset](https://typless.gitbook.io/typlessapi/typless/training/building-a-dataset)
3. [Train the document type](https://typless.gitbook.io/typlessapi/typless/training)

Once you have successfully completed the above steps, you can use the [/extract-data](https://typless.gitbook.io/typlessapi/api-docs/api-schema#api-extract-data) endpoint to extract data.\
We have prepared sample code to help you out:

<details>

<summary><strong>1 Open file as base64 string</strong> <em><mark style="color:green;">(Lines 4-6)</mark></em></summary>

Open the file in binary mode and encode its contents as a Base64 string.\
Make sure that your file is in the same directory as the script.

</details>

<details>

<summary><strong>2 Create payload</strong> <em><mark style="color:green;">(Lines 8-12)</mark></em></summary>

Create request payload with all the required parameters:

* file
* file\_name
* document\_type\_name

</details>

<details>

<summary><strong>3 Specify headers</strong> <em><mark style="color:green;">(Lines 16-20)</mark></em></summary>

Make sure that the Content-Type is set to application/json.

</details>

<details>

<summary><strong>4 Authorize with your API key</strong> <em><mark style="color:green;">(Line 19)</mark></em></summary>

You can get your API key at <https://app.typless.com/settings/profile>

</details>

<details>

<summary><strong>5 Execute the request</strong> <em><mark style="color:green;">(Line 22)</mark></em></summary>

Send the request and wait for the response.

</details>

{% tabs %}
{% tab title="Python" %}
{% code lineNumbers="true" %}

```python
import requests
import base64

file_name = 'name_of_your_document.pdf'  # document to send; opened relative to the working directory
with open(file_name, 'rb') as file:  # binary mode: read the raw bytes
    base64_data = base64.b64encode(file.read()).decode('utf-8')  # Base64-encode, then turn the bytes into a str so it fits in JSON

payload = {
    "file": base64_data,  # Base64-encoded file contents
    "file_name": file_name,  # original file name, including the file type suffix
    "document_type_name": "line-item-invoice"  # name of the document type used for extraction
}

url = "https://developers.typless.com/api/extract-data"

headers = {
    "Accept": "application/json",
    "Content-Type": "application/json",
    "Authorization": "<<apiKey>>"  # placeholder: replace with your API key
}

response = requests.request("POST", url, json=payload, headers=headers)  # json= sends the payload as the JSON request body

print(response.json())  # parse the response body as JSON and print it
```

{% endcode %}
{% endtab %}

{% tab title="Node" %}
{% code lineNumbers="true" %}

```javascript
// Sends a local document to the Typless extract-data endpoint and logs the JSON response.
const fetch = require('node-fetch');
const fs = require('fs');

// Read the document and Base64-encode its contents in one step.
const fileName = 'name_of_your_document.pdf';
const base64File = fs.readFileSync(fileName, {encoding: 'base64'});

// Typless extract-data endpoint.
const url = 'https://developers.typless.com/api/extract-data';

// Required request parameters.
const payload = {
  file: base64File,
  file_name: fileName,
  document_type_name: "line-item-invoice"
};

const headers = {
  'Accept': 'application/json',
  'Content-Type': 'application/json',
  // Placeholder: replace with your API key.
  'Authorization': '<<apiKey>>'
};

const options = {
  method: 'POST',
  headers: headers,
  // fetch does not serialize objects, so the payload is stringified explicitly.
  body: JSON.stringify(payload)
};

// Send the request, parse the response body as JSON and print it.
fetch(url, options)
  .then(res => res.json())
  .then(json => console.log(JSON.stringify(json)))
  .catch(err => console.error('error:' + err));
```

{% endcode %}
{% endtab %}
{% endtabs %}

Response:

{% tabs %}
{% tab title="JSON" %}
{% code lineNumbers="true" %}

```json
{
    "file_name": "name_of_your_document.pdf",
    "object_id": "1cb25cc8-c9fa-4149-9a83-b4ed6a2173b9",
    "extracted_fields": [
        {
            "name": "supplier",
            "values": [
                {
                    "x": -1,
                    "y": -1,
                    "width": -1,
                    "height": -1,
                    "value": "ScaleGrid",
                    "confidence_score": "0.968",
                    "page_number": -1
                }
            ],
            "data_type": "AUTHOR"
        },
        {
            "name": "invoice_number",
            "values": [
                {
                    "x": 1989,
                    "y": 545,
                    "width": 323,
                    "height": 54,
                    "value": "20190500005890",
                    "confidence_score": "0.250",
                    "page_number": 0
                },
                {
                    "x": 167,
                    "y": 574,
                    "width": 391,
                    "height": 54,
                    "value": "GB123456789",
                    "confidence_score": "0.250",
                    "page_number": 0
                }
            ],
            "data_type": "STRING"
        },
        {
            "name": "issue_date",
            "values": [
                {
                    "x": 2072,
                    "y": 628,
                    "width": 240,
                    "height": 54,
                    "value": "2019-06-05",
                    "confidence_score": "0.358",
                    "page_number": 0
                }
            ],
            "data_type": "DATE"
        },
        {
            "name": "total_amount",
            "values": [
                {
                    "x": 2146,
                    "y": 1196,
                    "width": 126,
                    "height": 54,
                    "value": "47.5300",
                    "confidence_score": "0.990",
                    "page_number": 0
                }
            ],
            "data_type": "NUMBER"
        }
    ],
    "line_items": [
        [
            {
                "name": "Description",
                "values": [
                    {
                        "x": 208,
                        "y": 1196,
                        "width": 1022,
                        "height": 50,
                        "value": "5/2019-MongoBackend-MgmtStandalone-Small-744 hours",
                        "confidence_score": "0.661",
                        "page_number": 0
                    }
                ],
                "data_type": "STRING"
            },
            {
                "name": "Price",
                "values": [
                    {
                        "x": 2146,
                        "y": 1196,
                        "width": 126,
                        "height": 54,
                        "value": "47.5300",
                        "confidence_score": "0.582",
                        "page_number": 0
                    }
                ],
                "data_type": "NUMBER"
            },
            {
                "name": "Quantity",
                "values": [
                    {
                        "x": 1979,
                        "y": 1196,
                        "width": 23,
                        "height": 54,
                        "value": "1",
                        "confidence_score": "0.647",
                        "page_number": 0
                    }
                ],
                "data_type": "NUMBER"
            }
        ]
    ],
    "customer": null
}
```

{% endcode %}
{% endtab %}
{% endtabs %}

## Supported file types

Typless supports the following file types:

* **PDF**
* **JPG**
* **PNG**
* **TIFF**

If you are working with scanned documents, we recommend using a resolution of **300 DPI** to achieve optimal results.

{% hint style="warning" %}
<mark style="color:orange;">**🚧 Are you having problems with document quality?**</mark>

We wrote a short blog post on how to solve them [here](https://typless.com/intelligent-document-processing-scan-recognition/).
{% endhint %}

## Line item extraction

If you want to extract line items, make sure you have defined the line-item structure in the [document type](https://typless.gitbook.io/typlessapi/typless/document-type).

## Request parameters

<table><thead><tr><th width="216">Param</th><th width="132">Type</th><th width="104">Required</th><th>Details</th></tr></thead><tbody><tr><td>document_type_name</td><td>string</td><td>YES</td><td>Name of the document type that you use for extraction.</td></tr><tr><td>file</td><td>string (Base64 encoded)</td><td>YES</td><td>The original file of the document that you are extracting data from.</td></tr><tr><td>file_name</td><td>string</td><td>YES</td><td>Name of the original file of the document that you are extracting data from. <br>Name must include file type suffix, e.g., document.pdf</td></tr><tr><td>customer</td><td>string</td><td>NO</td><td>Your internal customer identification, used for .csv usage report. e.g., "my customer"</td></tr></tbody></table>

## Understanding response

In the response, you will always find all of the fields you defined in the document type. Values for fields will be sorted by the confidence score. If the field is not present on the document, the value will be set to **null**.

### Response base params

<table><thead><tr><th width="188">Param</th><th width="150">Type</th><th>Details</th></tr></thead><tbody><tr><td>file_name</td><td>string</td><td>Same value as provided in request</td></tr><tr><td>object_id</td><td>string</td><td>Id of document for <a href="../training/building-a-dataset#using-live-data">sending feedback to dataset</a>.</td></tr><tr><td>extracted_fields</td><td>list</td><td>List of extracted fields</td></tr><tr><td>line_items</td><td>list</td><td>List of extracted line items. Each line item is a list of fields with the same structure as the extracted fields</td></tr><tr><td>customer</td><td>string</td><td>Same value as provided in request</td></tr></tbody></table>

### Extracted fields params

<table><thead><tr><th width="189">Param</th><th width="149">Type</th><th>Behaviour</th></tr></thead><tbody><tr><td>name</td><td>string</td><td>Name of the <a href="extraction-fields">field</a> you defined in the <a href="document-type">document type</a></td></tr><tr><td>values</td><td>list</td><td>List of values for the field</td></tr><tr><td>data_type</td><td>string</td><td>Type of the <a href="extraction-fields">field</a> you defined in the <a href="document-type">document type</a></td></tr></tbody></table>

### Extracted values params

<table><thead><tr><th width="177">Param</th><th width="95">Type</th><th>Behaviour</th></tr></thead><tbody><tr><td>x</td><td>int</td><td>X coordinate of the top-left corner of the bounding box. If the value is null, x will be -1</td></tr><tr><td>y</td><td>int</td><td>Y coordinate of the top-left corner of the bounding box. If the value is null, y will be -1</td></tr><tr><td>width</td><td>int</td><td>Bounding box width. If the value is null, the width will be -1</td></tr><tr><td>height</td><td>int</td><td>Bounding box height. If the value is null, the height will be -1</td></tr><tr><td>value</td><td>string</td><td>Value for <a href="extraction-fields">field in standard format</a></td></tr><tr><td>confidence_score</td><td>string</td><td>Value between 0 and 1. The higher the value, the more confident the system is</td></tr><tr><td>page_number</td><td>int</td><td>Page on which the value is present. If the value is null, the page number will be -1</td></tr></tbody></table>

{% hint style="info" %}
**📘&#x20;**<mark style="color:blue;">**Field**</mark><mark style="color:blue;">**&#x20;**</mark>*<mark style="color:blue;">**supplier\_name**</mark>*<mark style="color:blue;">**&#x20;**</mark><mark style="color:blue;">**will never carry any positional data - Just value!**</mark>

All positional parameters (x, y, width, height and page\_number) will always be -1; only value and confidence\_score carry data.
{% endhint %}

### Example full response

{% tabs %}
{% tab title="JSON" %}
{% code lineNumbers="true" %}

```json
{
    "file_name": "invoice_2.pdf",
    "object_id": "1cb25cc8-c9fa-4149-9a83-b4ed6a2173b9",
    "extracted_fields": [
        {
            "name": "supplier",
            "values": [
                {
                    "x": -1,
                    "y": -1,
                    "width": -1,
                    "height": -1,
                    "value": "ScaleGrid",
                    "confidence_score": "0.968",
                    "page_number": -1
                }
            ],
            "data_type": "AUTHOR"
        },
        {
            "name": "invoice_number",
            "values": [
                {
                    "x": 1989,
                    "y": 545,
                    "width": 323,
                    "height": 54,
                    "value": "20190500005890",
                    "confidence_score": "0.250",
                    "page_number": 0
                },
                {
                    "x": 167,
                    "y": 574,
                    "width": 391,
                    "height": 54,
                    "value": "GB123456789",
                    "confidence_score": "0.250",
                    "page_number": 0
                }
            ],
            "data_type": "STRING"
        },
        {
            "name": "issue_date",
            "values": [
                {
                    "x": 2072,
                    "y": 628,
                    "width": 240,
                    "height": 54,
                    "value": "2019-06-05",
                    "confidence_score": "0.358",
                    "page_number": 0
                }
            ],
            "data_type": "DATE"
        },
        {
            "name": "total_amount",
            "values": [
                {
                    "x": 2146,
                    "y": 1196,
                    "width": 126,
                    "height": 54,
                    "value": "47.5300",
                    "confidence_score": "0.990",
                    "page_number": 0
                }
            ],
            "data_type": "NUMBER"
        }
    ],
    "line_items": [
        [
            {
                "name": "Description",
                "values": [
                    {
                        "x": 208,
                        "y": 1196,
                        "width": 1022,
                        "height": 50,
                        "value": "5/2019-MongoBackend-MgmtStandalone-Small-744 hours",
                        "confidence_score": "0.661",
                        "page_number": 0
                    }
                ],
                "data_type": "STRING"
            },
            {
                "name": "Price",
                "values": [
                    {
                        "x": 2146,
                        "y": 1196,
                        "width": 126,
                        "height": 54,
                        "value": "47.5300",
                        "confidence_score": "0.582",
                        "page_number": 0
                    }
                ],
                "data_type": "NUMBER"
            },
            {
                "name": "Quantity",
                "values": [
                    {
                        "x": 1979,
                        "y": 1196,
                        "width": 23,
                        "height": 54,
                        "value": "1",
                        "confidence_score": "0.647",
                        "page_number": 0
                    }
                ],
                "data_type": "NUMBER"
            }
        ]
    ],
    "customer": null
}
```

{% endcode %}
{% endtab %}
{% endtabs %}

### Example null response

{% tabs %}
{% tab title="JSON" %}
{% code lineNumbers="true" %}

```json
{
    "file_name": "invoice.pdf",
    "object_id": "26e01d82-e7f4-48d3-a902-b74283b73279",
    "extracted_fields": [
        {
            "name": "total_amount",
            "values": [
                {
                    "x": -1,
                    "y": -1,
                    "width": -1,
                    "height": -1,
                    "value": null,
                    "confidence_score": "0.000",
                    "page_number": -1
                }
            ],
            "data_type": "NUMBER"
        },
        {
            "name": "invoice_number",
            "values": [
                {
                    "x": -1,
                    "y": -1,
                    "width": -1,
                    "height": -1,
                    "value": null,
                    "confidence_score": "0.000",
                    "page_number": -1
                }
            ],
            "data_type": "STRING"
        },
        {
            "name": "issue_date",
            "values": [
                {
                    "x": -1,
                    "y": -1,
                    "width": -1,
                    "height": -1,
                    "value": null,
                    "confidence_score": "0.000",
                    "page_number": -1
                }
            ],
            "data_type": "DATE"
        },
        {
            "name": "supplier",
            "values": [
                {
                    "x": -1,
                    "y": -1,
                    "width": -1,
                    "height": -1,
                    "value": null,
                    "confidence_score": "0.000",
                    "page_number": -1
                }
            ],
            "data_type": "AUTHOR"
        }
    ],
    "line_items": [],
    "customer": null
}
```

{% endcode %}
{% endtab %}
{% endtabs %}
