import requests
import json
def _print_fields(fields):
    """Print every extracted field with its value and reported positions.

    Args:
        fields: Non-empty list of field dicts taken from the response; each
            is read for "key", "value" and "position".
    """
    print("\n=== 字段信息 ===")
    for field in fields:
        print(f"字段: {field.get('key', '')}")
        print(f"值: {field.get('value', '')}")
        # "position" may be absent or null; treat both as "no positions".
        for index, pos in enumerate(field.get("position") or [], start=1):
            # The page index is printed 1-based, so a missing/null page
            # falls back to 0 before the +1.
            page = pos.get("page") or 0
            vertices = pos.get("vertices", [])
            print(f" 位置 {index} (第{page + 1}页): {vertices}")
        print("-" * 30)


def _print_tables(tables):
    """Print every extracted table row by row.

    Args:
        tables: Non-empty list of table dicts taken from the response; each
            is read for "tableName" and "items" (a list of rows, each row a
            list of cell dicts with "key" and "value").
    """
    print("\n=== 表格信息 ===")
    for table in tables:
        print(f"表格名称: {table.get('tableName', '')}")
        for row_number, row in enumerate(table.get("items") or [], start=1):
            print(f" 第 {row_number} 行:")
            for cell in row or []:
                print(f" {cell.get('key')}: {cell.get('value')}")


def extract_specific_fields(workspace_id, task_id, app_id, secret_code,
                            *, fields=None, tables=None):
    """Request extraction of specific fields for a task and print the result.

    Sends a POST request to the ``extract_fields`` endpoint, validates the
    response, prints the returned fields and tables for every file, and
    returns the decoded response body.

    Args:
        workspace_id: Value sent as ``workspace_id`` in the request body.
        task_id: Value sent as ``task_id`` in the request body.
        app_id: Value sent in the ``x-ti-app-id`` header.
        secret_code: Value sent in the ``x-ti-secret-code`` header.
        fields: Optional list of field definitions (dicts with "key" and an
            optional "prompt") sent as ``fields``. When ``None``, the
            built-in invoice example (invoice code, issue date) is used.
        tables: Optional list of table definitions (dicts with "name" and
            "fields") sent as ``tables``. When ``None``, the built-in
            example table ("Table1") is used.

    Returns:
        The decoded JSON response as a dict, or ``None`` when the HTTP
        status is not 200, the body is not a JSON object, or the body's
        ``code`` is not 200 (a message is printed in each case).

    Raises:
        Exceptions raised by ``requests.post`` (connection errors, timeouts)
        are not caught here and propagate to the caller.
    """
    host = "https://docflow.textin.com"
    url = "/api/app-api/sip/platform/v2/file/extract_fields"

    # Fall back to the example definitions only when the caller passed
    # nothing; an explicit empty list is sent as-is.
    if fields is None:
        fields = [
            {
                "key": "发票代码",
                "prompt": "提取完整的发票代码",
            },
            {
                "key": "开票日期",
                "prompt": "只保留年的部分",
            },
        ]
    if tables is None:
        tables = [
            {
                "name": "Table1",
                "fields": [
                    {
                        "key": "货物名称",
                        "prompt": "提取商品全称",
                    },
                    {
                        "key": "单价",
                    },
                ],
            },
        ]

    # Request body
    payload = {
        "workspace_id": workspace_id,
        "task_id": task_id,
        "fields": fields,
        "tables": tables,
    }

    resp = requests.post(
        f"{host}{url}",
        json=payload,
        headers={
            "x-ti-app-id": app_id,
            "x-ti-secret-code": secret_code,
            "Content-Type": "application/json",
        },
        timeout=60,
    )

    if resp.status_code != 200:
        print(f"请求失败: {resp.status_code}")
        print(f"错误信息: {resp.text}")
        return None

    # A 200 response can still carry a body that is not valid JSON; report
    # it the same way as the other failure paths instead of raising.
    try:
        data = resp.json()
    except ValueError as exc:
        print(f"响应解析失败: {exc}")
        return None
    if not isinstance(data, dict):
        print(f"响应格式异常: {data!r}")
        return None

    if data.get("code") != 200:
        print(f"接口返回错误: {data.get('message')}")
        return None

    # Process the returned result. `or` is used instead of a .get() default
    # so that keys present with a null value are handled like missing keys.
    result = data.get("result") or {}
    for file_info in result.get("files") or []:
        print(f"文件名: {file_info.get('name')}")
        print(f"任务ID: {file_info.get('task_id')}")

        file_data = file_info.get("data") or {}

        # Extracted field information
        extracted_fields = file_data.get("fields") or []
        if extracted_fields:
            _print_fields(extracted_fields)

        # Extracted table information
        extracted_tables = file_data.get("tables") or []
        if extracted_tables:
            _print_tables(extracted_tables)

    return data
# Usage example
if __name__ == "__main__":
    # Placeholder identifiers and credentials; substitute real values
    # before running.
    workspace_id, task_id = "<your-workspace-id>", "<your-task-id>"
    app_id, secret_code = "<your-app-id>", "<your-secret-code>"
    result = extract_specific_fields(
        workspace_id=workspace_id,
        task_id=task_id,
        app_id=app_id,
        secret_code=secret_code,
    )