Hello guys,
I'm trying to set up Document AI, but I get the following error:
raise exceptions.from_grpc_error(exc) from exc
google.api_core.exceptions.Unknown: None Stream removed
This is my code:
from google.cloud import documentai, storage
from google.api_core.client_options import ClientOptions
from my_module import join_blob_paths
def parse_pdf(
    config: Config,
    source_blob: storage.Blob,
    destination_blob: storage.Blob,
    parsing_timeout: int
) -> list[str]:
    """Run a Document AI batch-processing job on PDFs stored in Cloud Storage.

    Builds a ``BatchProcessRequest`` for the processor identified by
    ``config``, submits it, and blocks until the long-running operation
    finishes or ``parsing_timeout`` elapses.

    Args:
        config: Provides ``PROJECT``, ``REGION`` and ``PROCESSOR_ID``.
        source_blob: Blob whose bucket name and object name form the input
            URI. If the URI does not end with ``/`` and contains a ``.`` it
            is sent as a single document; otherwise it is sent as a prefix.
        destination_blob: Blob whose bucket name and object name form the
            output URI passed to Document AI.
        parsing_timeout: Value passed as ``timeout`` to ``operation.result``.

    Returns:
        Whatever ``operation.result(...)`` returns.
        NOTE(review): the ``list[str]`` annotation is kept for interface
        compatibility, but nothing here builds a list of strings -- confirm
        what callers actually expect.

    Raises:
        RetryError, InternalServerError: Logged and then re-raised. The
            previous version swallowed them and then failed on
            ``return result`` with an ``UnboundLocalError`` that hid the
            real cause.
    """
    # NOTE(review): Config, logger, RetryError and InternalServerError are not
    # imported in the visible part of the file -- confirm they are in scope.
    project = config.PROJECT
    location = config.REGION
    processor_id = config.PROCESSOR_ID
    input_mime_type = "application/pdf"

    gcs_source_uri = (
        f"gs://{join_blob_paths(source_blob.bucket.name, source_blob.name)}"
    )
    gcs_destination_uri = (
        f"gs://{join_blob_paths(destination_blob.bucket.name, destination_blob.name)}"
    )

    # Initialize the client against the endpoint derived from the region.
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # Configure the source location.
    if not gcs_source_uri.endswith("/") and "." in gcs_source_uri:
        # Specific GCS URI: process an individual document.
        gcs_document = documentai.GcsDocument(
            gcs_uri=gcs_source_uri, mime_type=input_mime_type
        )
        gcs_documents = documentai.GcsDocuments(documents=[gcs_document])
        input_config = documentai.BatchDocumentsInputConfig(
            gcs_documents=gcs_documents
        )
    else:
        # GCS URI prefix: process an entire directory.
        gcs_prefix = documentai.GcsPrefix(gcs_uri_prefix=gcs_source_uri)
        input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=gcs_prefix)

    # Configure the Cloud Storage URI for the output directory.
    gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
        gcs_uri=gcs_destination_uri
    )
    output_config = documentai.DocumentOutputConfig(
        gcs_output_config=gcs_output_config
    )

    # Full resource name of the processor.
    name = client.processor_path(project, location, processor_id)

    request = documentai.BatchProcessRequest(
        name=name,
        input_documents=input_config,
        document_output_config=output_config,
    )

    # Submit the batch job; this returns a long-running operation handle.
    operation = client.batch_process_documents(request)

    logger.info(f"Waiting for the operation {operation.operation.name} to finish")
    try:
        result = operation.result(timeout=parsing_timeout)
    except (RetryError, InternalServerError) as e:
        logger.error(e.message)
        # Re-raise so the caller sees the real failure instead of an
        # UnboundLocalError from returning an unassigned ``result``.
        raise

    logger.info(f"Results saved in {gcs_destination_uri}")
    return result
To troubleshoot the google.api_core.exceptions.Unknown error with Document AI, can you please check the provided code and let me know whether everything is set up correctly?
User | Count |
---|---|
14 | |
1 | |
1 | |
1 | |
1 |