POST
/uploadfile

Authorizations

authorization
string
header, required

Format: token <token>. Corresponds to temporary access tokens.

Query Parameters

chunk_size
integer | null

Chunk size in tiktoken tokens to be used when processing file.

chunk_overlap
integer | null

Chunk overlap in tiktoken tokens to be used when processing file.

skip_embedding_generation
boolean
default: false

Flag to control whether or not embeddings should be generated and stored when processing file.

set_page_as_boundary
boolean
default: false

Flag to control whether or not to set a page's worth of content as the maximum amount of content that can appear in a chunk. Only valid for PDFs. See the route description for more information.

embedding_model
default: OPENAI

Embedding model that will be used to embed file chunks.

Available options:
OPENAI,
AZURE_OPENAI,
COHERE_MULTILINGUAL_V3,
OPENAI_ADA_LARGE_256,
OPENAI_ADA_LARGE_1024,
OPENAI_ADA_LARGE_3072,
OPENAI_ADA_SMALL_512,
OPENAI_ADA_SMALL_1536,
AZURE_ADA_LARGE_256,
AZURE_ADA_LARGE_1024,
AZURE_ADA_LARGE_3072,
AZURE_ADA_SMALL_512,
AZURE_ADA_SMALL_1536,
SOLAR_1_MINI
use_ocr
boolean
default: false

Whether or not to use OCR when processing files. Valid for PDFs, JPEGs, and PNGs. Useful for documents with tables, images, and/or scanned text.

generate_sparse_vectors
boolean
default: false

Whether or not to generate sparse vectors for the file. This is required for the file to be a candidate for hybrid search.

prepend_filename_to_chunks
boolean
default: false

Whether or not to prepend the file's name to chunks.

max_items_per_chunk
integer | null

Number of objects per chunk. For CSV, TSV, XLSX, and JSON files only.

Required range: x > 0
parse_pdf_tables_with_ocr
boolean
default: false

Whether to use rich table parsing when use_ocr is enabled.

detect_audio_language
boolean
default: false

Whether to automatically detect the language of the uploaded audio file.

transcription_service
enum<string> | null

The transcription service to use for audio files. If no service is specified, 'deepgram' will be used.

Available options:
assemblyai,
deepgram
include_speaker_labels
boolean
default: false

Detect multiple speakers and label segments of speech by speaker for audio files.

media_type
enum<string> | null

The media type of the file. If not provided, it will be inferred from the file extension.

Available options:
TEXT,
IMAGE,
AUDIO,
VIDEO
split_rows
boolean
default: false

Whether to split tabular rows into chunks. Currently only valid for CSV, TSV, and XLSX files.

enable_cold_storage
boolean
default: false

Enable cold storage for the file. If set to true, the file will be moved to cold storage after a certain period of inactivity. Default is false.

hot_storage_time_to_live
integer | null

Time in days after which the file will be moved to cold storage. Must be one of [1, 3, 7, 14, 30].

generate_chunks_only
boolean
default: false

If this flag is enabled, the file will be chunked and stored with Carbon, but no embeddings will be generated. This overrides the skip_embedding_generation flag.

store_file_only
boolean
default: false

If this flag is enabled, the file will be stored with Carbon, but no processing will be done.

Body

multipart/form-data
file
file
required

Response

200 - application/json
id
integer
required
source
enum<string>
required
Available options:
GOOGLE_CLOUD_STORAGE,
GOOGLE_DRIVE,
NOTION,
NOTION_DATABASE,
INTERCOM,
DROPBOX,
ONEDRIVE,
SHAREPOINT,
CONFLUENCE,
BOX,
ZENDESK,
ZOTERO,
S3,
AZURE_BLOB_STORAGE,
GMAIL,
OUTLOOK,
SERVICENOW,
TEXT,
CSV,
TSV,
PDF,
DOCX,
PPTX,
XLSX,
XLSM,
MD,
RTF,
JSON,
HTML,
RAW_TEXT,
WEB_SCRAPE,
RSS_FEED,
FRESHDESK,
GITBOOK,
SALESFORCE,
GITHUB,
SLACK,
GURU,
GONG,
DOCUMENT360,
JPG,
PNG,
JPEG,
MP3,
MP2,
AAC,
WAV,
FLAC,
PCM,
M4A,
OGG,
OPUS,
MPEG,
MPG,
MP4,
WMV,
AVI,
MOV,
MKV,
FLV,
WEBM,
EML,
MSG
organization_id
integer
required
organization_user_id
integer | null
required
organization_supplied_user_id
string
required
external_file_id
string
required
sync_status
enum<string>
required
Available options:
DELAYED,
QUEUED_FOR_SYNC,
SYNCING,
READY,
SYNC_ERROR,
EVALUATING_RESYNC,
RATE_LIMITED,
SYNC_ABORTED,
QUEUED_FOR_OCR,
READY_TO_SYNC
skip_embedding_generation
boolean
required
supports_cold_storage
boolean
required
embedding_storage_status
enum<string>
required
Available options:
HOT_STORAGE,
HOT_TO_COLD,
COLD_STORAGE,
COLD_TO_HOT
created_at
string
required
updated_at
string
required
organization_user_data_source_id
integer | null
external_url
string | null
sync_error_message
string | null
last_sync
string | null
tags
object | null
file_statistics
object | null
file_metadata
object | null
embedding_properties
object | null
chunk_size
integer | null
chunk_overlap
integer | null
chunk_properties
object | null
ocr_properties
object
ocr_job_started_at
string | null
name
string | null
parent_id
integer | null
enable_auto_sync
boolean | null
presigned_url
string | null
parsed_text_url
string | null
additional_presigned_urls
object | null
source_created_at
string | null
generate_sparse_vectors
boolean | null
request_id
string | null
upload_id
string | null
sync_properties
object
messages_metadata
object
file_contents_deleted
boolean
default: false
hot_storage_time_to_live
integer | null