apps/notes/feishu/server.py

"""notes 多用途 sidecar：
  POST /transcribe — 用 ffmpeg 切片 + 串行调外部 ASR，绕过单请求大小限制
  POST /convert    — markdown-to-feishu，把会议纪要 push 飞书 docx
"""

import json
import logging
import os
import shutil
import subprocess
import tempfile
import uuid
from pathlib import Path
from typing import Optional

import requests
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

# Configure root logging once at import time: INFO level, with timestamp,
# level, logger name and message on every line.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(name)s: %(message)s')
# Module-level logger used by the request handlers in this file.
log = logging.getLogger('feishu')

# FastAPI application object; the handlers below register on it via decorators.
app = FastAPI()


@app.get('/healthz')
def healthz():
    """Health-check endpoint: unconditionally answers with ``{'ok': True}``."""
    status = {'ok': True}
    return status


class TranscribeReq(BaseModel):
    """Request body for ``POST /transcribe``."""

    # Path of the audio file to transcribe; it is read from this process's
    # filesystem and handed to ffmpeg.
    audio_path: str
    # Length of each ffmpeg segment, in seconds.
    chunk_seconds: int = 60  # 60 s ≈ 1-1.5 MB of m4a, far below the ASR limit


def _chunk_index(path: Path) -> int:
    """Return the numeric index embedded in a ``chunk_<n>.<ext>`` file name."""
    return int(path.stem.rpartition('_')[2])


def _segment_audio(src: Path, out_dir: Path, chunk_seconds: int) -> list:
    """Split ``src`` into ``chunk_<n>`` files in ``out_dir``; return them in order.

    A stream copy is tried first (fast, lossless). If ffmpeg exits non-zero,
    any partial output is discarded and the audio is re-encoded to mono
    16 kHz AAC instead, which is slower but more reliable.

    Raises:
        HTTPException: 504 if ffmpeg exceeds its time limit, 500 if the
            re-encode fallback also exits non-zero.
    """
    common = ['ffmpeg', '-y', '-i', str(src),
              '-f', 'segment', '-segment_time', str(chunk_seconds)]
    ext = src.suffix.lstrip('.') or 'm4a'
    copy_cmd = common + ['-c', 'copy', '-reset_timestamps', '1',
                         str(out_dir / f'chunk_%03d.{ext}')]
    encode_cmd = common + ['-c:a', 'aac', '-b:a', '64k', '-ac', '1', '-ar', '16000',
                           '-reset_timestamps', '1',
                           str(out_dir / 'chunk_%03d.m4a')]
    try:
        try:
            subprocess.run(copy_cmd, check=True, capture_output=True, timeout=180)
        except subprocess.CalledProcessError:
            log.warning("ffmpeg -c copy 失败，回退 re-encode")
            # out_dir is private to this request, so everything in it is a
            # leftover from the failed copy attempt.
            for leftover in list(out_dir.iterdir()):
                leftover.unlink(missing_ok=True)
            subprocess.run(encode_cmd, check=True, capture_output=True, timeout=600)
    except subprocess.TimeoutExpired as exc:
        raise HTTPException(504, f'ffmpeg timeout (>{exc.timeout:.0f}s)') from exc
    except subprocess.CalledProcessError as exc:
        stderr_tail = (exc.stderr or b'').decode('utf-8', 'replace').strip()[-400:]
        raise HTTPException(500, f'ffmpeg exit {exc.returncode}: {stderr_tail}') from exc

    # Sort numerically: ffmpeg's %03d widens past 999, and a plain string sort
    # would then put chunk_1000 before chunk_101.
    return sorted(
        (p for p in out_dir.iterdir()
         if p.name.startswith('chunk_') and p.stem.rpartition('_')[2].isdigit()),
        key=_chunk_index,
    )


def _asr_chunk(chunk: Path, index: int, asr_url: str, asr_token: str) -> str:
    """Upload one chunk to the ASR endpoint and return its stripped transcript.

    Raises:
        HTTPException: 502 if the request fails, the ASR answers with a
            non-2xx status, or the response is not a JSON object whose
            ``text`` (when present) is a string.
    """
    try:
        with open(chunk, 'rb') as f:
            r = requests.post(
                asr_url,
                headers={'Authorization': f'Bearer {asr_token}'},
                files={'file': (chunk.name, f, 'audio/mp4')},
                data={'model': 'qwen3-asr', 'response_format': 'json'},
                timeout=300,
            )
    except requests.RequestException as exc:
        raise HTTPException(502, f'ASR chunk {index} request failed: {exc}') from exc
    if not r.ok:
        raise HTTPException(502, f'ASR chunk {index} {r.status_code}: {r.text[:300]}')
    try:
        payload = r.json()
    except ValueError as exc:
        raise HTTPException(502, f'ASR chunk {index} bad json: {r.text[:200]}') from exc
    text = payload.get('text', '') if isinstance(payload, dict) else None
    if not isinstance(text, str):
        raise HTTPException(502, f'ASR chunk {index} bad json: {r.text[:200]}')
    return text.strip()


@app.post('/transcribe')
def transcribe(req: TranscribeReq):
    """Split the audio with ffmpeg, send each chunk to the ASR in turn, join the text.

    Args:
        req: ``audio_path`` of the file to transcribe and ``chunk_seconds``,
            the segment length passed to ffmpeg.

    Returns:
        ``{'text': <non-empty chunk transcripts joined by newlines>,
        'chunks': <number of chunks produced>}``.

    Raises:
        HTTPException: 400 if the audio file is missing or ``chunk_seconds``
            is not positive; 500 if ``ASR_URL``/``ASR_TOKEN`` are unset or
            ffmpeg fails or yields no chunks; 504 if ffmpeg times out; 502 on
            any ASR failure.
    """
    src = Path(req.audio_path)
    if not src.is_file():
        raise HTTPException(400, f'audio not found: {src}')
    if req.chunk_seconds <= 0:
        raise HTTPException(400, f'chunk_seconds must be positive, got {req.chunk_seconds}')
    asr_url = os.environ.get('ASR_URL', '')
    asr_token = os.environ.get('ASR_TOKEN', '')
    if not asr_url or not asr_token:
        raise HTTPException(500, 'ASR_URL/ASR_TOKEN not configured in sidecar')

    # mkdtemp creates the directory atomically with owner-only permissions.
    tmp = Path(tempfile.mkdtemp(prefix='transcribe-'))
    try:
        chunks = _segment_audio(src, tmp, req.chunk_seconds)
        if not chunks:
            raise HTTPException(500, 'ffmpeg produced 0 chunks')
        log.info("split %s → %d chunks", src.name, len(chunks))

        all_text = []
        for i, c in enumerate(chunks, 1):
            log.info("ASR chunk %d/%d (%s, %d KB)", i, len(chunks), c.name, c.stat().st_size // 1024)
            all_text.append(_asr_chunk(c, i, asr_url, asr_token))
        full = '\n'.join(t for t in all_text if t)
        return {'text': full, 'chunks': len(chunks)}
    finally:
        # Best-effort cleanup; a failure here must not mask the real result.
        shutil.rmtree(tmp, ignore_errors=True)


class ConvertReq(BaseModel):
    """Request body for ``POST /convert``."""

    # Path of the markdown file passed to the markdown-to-feishu CLI.
    md_path: str
    # Optional document title; forwarded as ``--title`` when set.
    title: Optional[str] = None
    # Optional id of an existing document; forwarded as ``--update`` when set.
    existing_doc_id: Optional[str] = None


def _parse_trailing_json(out: str):
    """Return the JSON object that ends ``out``, or ``None`` if it has no ``{``.

    Candidate ``{`` positions are tried from the right-most one leftwards, so a
    nested object or a ``{`` inside a string value does not hide the enclosing
    object. If no candidate parses, the decode error from the right-most
    candidate is raised.
    """
    first_error = None
    pos = out.rfind('{')
    while pos >= 0:
        try:
            parsed = json.loads(out[pos:])
        except json.JSONDecodeError as exc:
            if first_error is None:
                first_error = exc
        else:
            if isinstance(parsed, dict):
                return parsed
        pos = out.rfind('{', 0, pos)
    if first_error is not None:
        raise first_error
    return None


@app.post('/convert')
def convert(req: ConvertReq):
    """Run markdown-to-feishu on a markdown file and report the resulting doc.

    Args:
        req: ``md_path`` of the markdown file, plus optional ``title`` and
            ``existing_doc_id`` (passed as ``--title`` / ``--update``).

    Returns:
        ``{'doc_id', 'url', 'embeds_inserted', 'embeds_failed'}`` taken from
        the JSON object the CLI prints last; the embed counters default to 0.

    Raises:
        HTTPException: 400 if the markdown file is missing; 500 if the CLI
            cannot be started; 504 if it runs longer than 10 minutes; 502 if
            it exits with a code other than 0 or 2, or its output carries no
            usable JSON with ``doc_id`` and ``url``.
    """
    md = Path(req.md_path)
    if not md.is_file():
        raise HTTPException(400, f'md not found: {md}')

    cmd = ['/usr/local/bin/markdown-to-feishu', str(md), '--as', 'user']
    if req.existing_doc_id:
        cmd += ['--update', req.existing_doc_id]
    if req.title:
        cmd += ['--title', req.title]
    log.info("run: %s", ' '.join(cmd))

    env = os.environ.copy()
    # Keep the markdown-to-feishu state file on the PVC so it survives restarts.
    state_dir = '/data/feishu-state'
    env['MD2FEISHU_STATE_DIR'] = state_dir
    Path(state_dir).mkdir(parents=True, exist_ok=True)

    try:
        proc = subprocess.run(
            cmd, capture_output=True, text=True, timeout=600, env=env,
            cwd=str(md.parent),
        )
    except subprocess.TimeoutExpired as exc:
        raise HTTPException(504, 'markdown-to-feishu timeout (>10min)') from exc
    except OSError as exc:
        # E.g. the CLI binary is missing or not executable.
        raise HTTPException(500, f'markdown-to-feishu failed to start: {exc}') from exc

    # Exit code 2 means some embeds failed but the doc was still created, so
    # stdout is parsed in that case too.
    if proc.returncode not in (0, 2):
        log.warning("md2feishu exit=%d stderr=%s", proc.returncode, proc.stderr[-500:])
        raise HTTPException(502, f'md2feishu exit {proc.returncode}: '
                                 f'{proc.stderr.strip()[-400:]}')

    # The script's final print is a JSON object at the very end of stdout.
    out = proc.stdout.strip()
    try:
        data = _parse_trailing_json(out)
    except json.JSONDecodeError as e:
        raise HTTPException(502, f'md2feishu json parse: {e}; tail: {out[-400:]}') from e
    if data is None:
        raise HTTPException(502, f'md2feishu no json output. stdout tail: {out[-400:]}')

    doc_id = data.get('doc_id')
    url = data.get('url')
    if not doc_id or not url:
        raise HTTPException(502, f'md2feishu missing doc_id/url: {data}')
    log.info("ok: doc_id=%s url=%s embeds=%s",
             doc_id, url, data.get('embeds_inserted'))
    return {
        'doc_id': doc_id,
        'url': url,
        'embeds_inserted': data.get('embeds_inserted', 0),
        'embeds_failed': data.get('embeds_failed', 0),
    }
-												notes(asr): 切片串行 ASR 绕单文件大小限制

ASR server 直接 500 拒绝大文件 (15MB / ~15min 4.7s 即返回 500)，不是
处理超时。改成：sidecar 装 ffmpeg → /transcribe endpoint 把音频切 60s
段 → 串行调外部 ASR → 拼接 transcript。notes 主容器 call_asr 改成 POST
到 sidecar /transcribe（timeout 1h 给长录音留余地）。

- feishu sidecar Dockerfile + ffmpeg + requests
- server.py 加 TranscribeReq；fallback -c copy 失败时 re-encode AAC
- main.rs 删除 asr_url/asr_token 字段（now sidecar concern）
- k8s manifest: ASR_URL/ASR_TOKEN 从主容器移到 feishu sidecar env

											
										
										
											2026-05-17 22:38:05 +01:00
+								"""notes 多用途 sidecar：
 								  POST /transcribe — 用 ffmpeg 切片 + 串行调外部 ASR，绕过单请求大小限制
 								  POST /convert    — markdown-to-feishu，把会议纪要 push 飞书 docx
-												notes: 加一键转飞书文档 (sidecar markdown-to-feishu)

- backend: POST /api/recordings/:id/feishu → 拼 markdown (总结在最上 + 附件链接到转录/录音 + 转写全文) → 写 /data/feishu-tmp/<id>/ → HTTP POST 到 feishu sidecar
- 复用：已有 feishu_doc_id 时 --update 同一个 doc，前端按钮文案变「↻ 重新生成」
- schema 加 feishu_doc_id + feishu_url 两列（ALTER TABLE 兼容旧 db）
- LLM prompt 改：行动项用 markdown checkbox `- [ ] 谁·做什么·何时`
- sidecar apps/notes/feishu: node:20 + python3 + python3-markdown + @larksuite/cli + COPY 自己的 markdown-to-feishu script + FastAPI /convert
- k8s: deployment 加 feishu container 共享 PVC；lark-cli-creds Secret 挂 /root/.lark-cli/config.json
- CI: 主 image --no-cache（cube 规矩），sidecar 保留 layer cache（chromium-free，但 apt/npm 也大）
- 前端: content 头部加「📤 一键转飞书文档」按钮；已转过显示飞书链接 + 按钮变重生成

											
										
										
											2026-05-17 22:16:13 +01:00
+								"""
 								import json
 								import logging
 								import os
-												notes(asr): 切片串行 ASR 绕单文件大小限制

ASR server 直接 500 拒绝大文件 (15MB / ~15min 4.7s 即返回 500)，不是
处理超时。改成：sidecar 装 ffmpeg → /transcribe endpoint 把音频切 60s
段 → 串行调外部 ASR → 拼接 transcript。notes 主容器 call_asr 改成 POST
到 sidecar /transcribe（timeout 1h 给长录音留余地）。

- feishu sidecar Dockerfile + ffmpeg + requests
- server.py 加 TranscribeReq；fallback -c copy 失败时 re-encode AAC
- main.rs 删除 asr_url/asr_token 字段（now sidecar concern）
- k8s manifest: ASR_URL/ASR_TOKEN 从主容器移到 feishu sidecar env

											
										
										
											2026-05-17 22:38:05 +01:00
+								import shutil
-												notes: 加一键转飞书文档 (sidecar markdown-to-feishu)

- backend: POST /api/recordings/:id/feishu → 拼 markdown (总结在最上 + 附件链接到转录/录音 + 转写全文) → 写 /data/feishu-tmp/<id>/ → HTTP POST 到 feishu sidecar
- 复用：已有 feishu_doc_id 时 --update 同一个 doc，前端按钮文案变「↻ 重新生成」
- schema 加 feishu_doc_id + feishu_url 两列（ALTER TABLE 兼容旧 db）
- LLM prompt 改：行动项用 markdown checkbox `- [ ] 谁·做什么·何时`
- sidecar apps/notes/feishu: node:20 + python3 + python3-markdown + @larksuite/cli + COPY 自己的 markdown-to-feishu script + FastAPI /convert
- k8s: deployment 加 feishu container 共享 PVC；lark-cli-creds Secret 挂 /root/.lark-cli/config.json
- CI: 主 image --no-cache（cube 规矩），sidecar 保留 layer cache（chromium-free，但 apt/npm 也大）
- 前端: content 头部加「📤 一键转飞书文档」按钮；已转过显示飞书链接 + 按钮变重生成

											
										
										
											2026-05-17 22:16:13 +01:00
+								import subprocess
-												notes(asr): 切片串行 ASR 绕单文件大小限制

ASR server 直接 500 拒绝大文件 (15MB / ~15min 4.7s 即返回 500)，不是
处理超时。改成：sidecar 装 ffmpeg → /transcribe endpoint 把音频切 60s
段 → 串行调外部 ASR → 拼接 transcript。notes 主容器 call_asr 改成 POST
到 sidecar /transcribe（timeout 1h 给长录音留余地）。

- feishu sidecar Dockerfile + ffmpeg + requests
- server.py 加 TranscribeReq；fallback -c copy 失败时 re-encode AAC
- main.rs 删除 asr_url/asr_token 字段（now sidecar concern）
- k8s manifest: ASR_URL/ASR_TOKEN 从主容器移到 feishu sidecar env

											
										
										
											2026-05-17 22:38:05 +01:00
+								import tempfile
 								import uuid
-												notes: 加一键转飞书文档 (sidecar markdown-to-feishu)

- backend: POST /api/recordings/:id/feishu → 拼 markdown (总结在最上 + 附件链接到转录/录音 + 转写全文) → 写 /data/feishu-tmp/<id>/ → HTTP POST 到 feishu sidecar
- 复用：已有 feishu_doc_id 时 --update 同一个 doc，前端按钮文案变「↻ 重新生成」
- schema 加 feishu_doc_id + feishu_url 两列（ALTER TABLE 兼容旧 db）
- LLM prompt 改：行动项用 markdown checkbox `- [ ] 谁·做什么·何时`
- sidecar apps/notes/feishu: node:20 + python3 + python3-markdown + @larksuite/cli + COPY 自己的 markdown-to-feishu script + FastAPI /convert
- k8s: deployment 加 feishu container 共享 PVC；lark-cli-creds Secret 挂 /root/.lark-cli/config.json
- CI: 主 image --no-cache（cube 规矩），sidecar 保留 layer cache（chromium-free，但 apt/npm 也大）
- 前端: content 头部加「📤 一键转飞书文档」按钮；已转过显示飞书链接 + 按钮变重生成

											
										
										
											2026-05-17 22:16:13 +01:00
+								from pathlib import Path
 								from typing import Optional
-												notes(asr): 切片串行 ASR 绕单文件大小限制

ASR server 直接 500 拒绝大文件 (15MB / ~15min 4.7s 即返回 500)，不是
处理超时。改成：sidecar 装 ffmpeg → /transcribe endpoint 把音频切 60s
段 → 串行调外部 ASR → 拼接 transcript。notes 主容器 call_asr 改成 POST
到 sidecar /transcribe（timeout 1h 给长录音留余地）。

- feishu sidecar Dockerfile + ffmpeg + requests
- server.py 加 TranscribeReq；fallback -c copy 失败时 re-encode AAC
- main.rs 删除 asr_url/asr_token 字段（now sidecar concern）
- k8s manifest: ASR_URL/ASR_TOKEN 从主容器移到 feishu sidecar env

											
										
										
											2026-05-17 22:38:05 +01:00
+								import requests
-												notes: 加一键转飞书文档 (sidecar markdown-to-feishu)

- backend: POST /api/recordings/:id/feishu → 拼 markdown (总结在最上 + 附件链接到转录/录音 + 转写全文) → 写 /data/feishu-tmp/<id>/ → HTTP POST 到 feishu sidecar
- 复用：已有 feishu_doc_id 时 --update 同一个 doc，前端按钮文案变「↻ 重新生成」
- schema 加 feishu_doc_id + feishu_url 两列（ALTER TABLE 兼容旧 db）
- LLM prompt 改：行动项用 markdown checkbox `- [ ] 谁·做什么·何时`
- sidecar apps/notes/feishu: node:20 + python3 + python3-markdown + @larksuite/cli + COPY 自己的 markdown-to-feishu script + FastAPI /convert
- k8s: deployment 加 feishu container 共享 PVC；lark-cli-creds Secret 挂 /root/.lark-cli/config.json
- CI: 主 image --no-cache（cube 规矩），sidecar 保留 layer cache（chromium-free，但 apt/npm 也大）
- 前端: content 头部加「📤 一键转飞书文档」按钮；已转过显示飞书链接 + 按钮变重生成

											
										
										
											2026-05-17 22:16:13 +01:00
+								from fastapi import FastAPI, HTTPException
 								from pydantic import BaseModel
 								logging.basicConfig(level=logging.INFO,
 								                    format='%(asctime)s %(levelname)s %(name)s: %(message)s')
 								log = logging.getLogger('feishu')
 								app = FastAPI()
 								@app.get('/healthz')
 								def healthz():
 								    return {'ok': True}
-												notes(asr): 切片串行 ASR 绕单文件大小限制

ASR server 直接 500 拒绝大文件 (15MB / ~15min 4.7s 即返回 500)，不是
处理超时。改成：sidecar 装 ffmpeg → /transcribe endpoint 把音频切 60s
段 → 串行调外部 ASR → 拼接 transcript。notes 主容器 call_asr 改成 POST
到 sidecar /transcribe（timeout 1h 给长录音留余地）。

- feishu sidecar Dockerfile + ffmpeg + requests
- server.py 加 TranscribeReq；fallback -c copy 失败时 re-encode AAC
- main.rs 删除 asr_url/asr_token 字段（now sidecar concern）
- k8s manifest: ASR_URL/ASR_TOKEN 从主容器移到 feishu sidecar env

											
										
										
											2026-05-17 22:38:05 +01:00
+								class TranscribeReq(BaseModel):
 								    audio_path: str
 								    chunk_seconds: int = 60  # 60s ≈ 1-1.5 MB m4a，远低于 ASR 限制
 								@app.post('/transcribe')
 								def transcribe(req: TranscribeReq):
 								    """ffmpeg 切片 → 串行喂外部 ASR → 拼接 transcript。"""
 								    src = Path(req.audio_path)
 								    if not src.exists():
 								        raise HTTPException(400, f'audio not found: {src}')
 								    asr_url = os.environ.get('ASR_URL', '')
 								    asr_token = os.environ.get('ASR_TOKEN', '')
 								    if not asr_url or not asr_token:
 								        raise HTTPException(500, 'ASR_URL/ASR_TOKEN not configured in sidecar')
 								    tmp = Path(tempfile.gettempdir()) / f'transcribe-{uuid.uuid4().hex}'
 								    tmp.mkdir(parents=True)
 								    try:
 								        # 用 ffmpeg segment：直接 copy stream（fast & 不损失质量）
 								        # 个别情况下 -c copy 在某些容器格式下切不精准，回退 re-encode 到 aac
 								        ext = src.suffix.lstrip('.') or 'm4a'
 								        chunk_pattern = f'chunk_%03d.{ext}'
 								        try:
 								            subprocess.run(
 								                ['ffmpeg', '-y', '-i', str(src),
 								                 '-f', 'segment', '-segment_time', str(req.chunk_seconds),
 								                 '-c', 'copy', '-reset_timestamps', '1',
 								                 str(tmp / chunk_pattern)],
 								                check=True, capture_output=True, timeout=180,
 								            )
 								        except subprocess.CalledProcessError:
 								            # fallback: re-encode AAC，慢但稳
 								            log.warning("ffmpeg -c copy 失败，回退 re-encode")
 								            for p in tmp.glob(f'chunk_*.{ext}'):
 								                p.unlink(missing_ok=True)
 								            subprocess.run(
 								                ['ffmpeg', '-y', '-i', str(src),
 								                 '-f', 'segment', '-segment_time', str(req.chunk_seconds),
 								                 '-c:a', 'aac', '-b:a', '64k', '-ac', '1', '-ar', '16000',
 								                 '-reset_timestamps', '1',
 								                 str(tmp / 'chunk_%03d.m4a')],
 								                check=True, capture_output=True, timeout=600,
 								            )
 								            ext = 'm4a'
 								        chunks = sorted(tmp.glob(f'chunk_*.{ext}'))
 								        if not chunks:
 								            raise HTTPException(500, 'ffmpeg produced 0 chunks')
 								        log.info("split %s → %d chunks", src.name, len(chunks))
 								        all_text = []
 								        for i, c in enumerate(chunks, 1):
 								            log.info("ASR chunk %d/%d (%s, %d KB)", i, len(chunks), c.name, c.stat().st_size // 1024)
 								            with open(c, 'rb') as f:
 								                r = requests.post(
 								                    asr_url,
 								                    headers={'Authorization': f'Bearer {asr_token}'},
 								                    files={'file': (c.name, f, 'audio/mp4')},
 								                    data={'model': 'qwen3-asr', 'response_format': 'json'},
 								                    timeout=300,
 								                )
 								            if not r.ok:
 								                raise HTTPException(502, f'ASR chunk {i} {r.status_code}: {r.text[:300]}')
 								            try:
 								                text = r.json().get('text', '').strip()
 								            except Exception:
 								                raise HTTPException(502, f'ASR chunk {i} bad json: {r.text[:200]}')
 								            all_text.append(text)
 								        full = '\n'.join(t for t in all_text if t)
 								        return {'text': full, 'chunks': len(chunks)}
 								    finally:
 								        shutil.rmtree(tmp, ignore_errors=True)
-												notes: 加一键转飞书文档 (sidecar markdown-to-feishu)

- backend: POST /api/recordings/:id/feishu → 拼 markdown (总结在最上 + 附件链接到转录/录音 + 转写全文) → 写 /data/feishu-tmp/<id>/ → HTTP POST 到 feishu sidecar
- 复用：已有 feishu_doc_id 时 --update 同一个 doc，前端按钮文案变「↻ 重新生成」
- schema 加 feishu_doc_id + feishu_url 两列（ALTER TABLE 兼容旧 db）
- LLM prompt 改：行动项用 markdown checkbox `- [ ] 谁·做什么·何时`
- sidecar apps/notes/feishu: node:20 + python3 + python3-markdown + @larksuite/cli + COPY 自己的 markdown-to-feishu script + FastAPI /convert
- k8s: deployment 加 feishu container 共享 PVC；lark-cli-creds Secret 挂 /root/.lark-cli/config.json
- CI: 主 image --no-cache（cube 规矩），sidecar 保留 layer cache（chromium-free，但 apt/npm 也大）
- 前端: content 头部加「📤 一键转飞书文档」按钮；已转过显示飞书链接 + 按钮变重生成

											
										
										
											2026-05-17 22:16:13 +01:00
+								class ConvertReq(BaseModel):
 								    md_path: str
 								    title: Optional[str] = None
 								    existing_doc_id: Optional[str] = None
 								@app.post('/convert')
 								def convert(req: ConvertReq):
 								    md = Path(req.md_path)
 								    if not md.exists():
 								        raise HTTPException(400, f'md not found: {md}')
 								    cmd = ['/usr/local/bin/markdown-to-feishu', str(md), '--as', 'user']
 								    if req.existing_doc_id:
 								        cmd += ['--update', req.existing_doc_id]
 								    if req.title:
 								        cmd += ['--title', req.title]
 								    log.info("run: %s", ' '.join(cmd))
 								    env = os.environ.copy()
 								    # markdown-to-feishu state file 放 PVC，重启不丢
 								    env['MD2FEISHU_STATE_DIR'] = '/data/feishu-state'
 								    Path('/data/feishu-state').mkdir(parents=True, exist_ok=True)
 								    try:
 								        proc = subprocess.run(
 								            cmd, capture_output=True, text=True, timeout=600, env=env,
 								            cwd=str(md.parent),
 								        )
 								    except subprocess.TimeoutExpired:
 								        raise HTTPException(504, 'markdown-to-feishu timeout (>10min)')
 								    # exit code 2 = embeds 有失败，但 doc 创建成功，仍 parse stdout
 								    if proc.returncode not in (0, 2):
 								        log.warning("md2feishu exit=%d stderr=%s", proc.returncode, proc.stderr[-500:])
 								        raise HTTPException(502, f'md2feishu exit {proc.returncode}: '
 								                                 f'{proc.stderr.strip()[-400:]}')
 								    # 取 stdout 里最后一段 JSON 对象（script 的 final print）
 								    out = proc.stdout.strip()
 								    # 从后往前找第一个 '{'，取到末尾
 								    last_open = out.rfind('{')
 								    if last_open < 0:
 								        raise HTTPException(502, f'md2feishu no json output. stdout tail: {out[-400:]}')
 								    try:
 								        data = json.loads(out[last_open:])
 								    except json.JSONDecodeError as e:
 								        raise HTTPException(502, f'md2feishu json parse: {e}; tail: {out[-400:]}')
 								    doc_id = data.get('doc_id')
 								    url = data.get('url')
 								    if not doc_id or not url:
 								        raise HTTPException(502, f'md2feishu missing doc_id/url: {data}')
 								    log.info("ok: doc_id=%s url=%s embeds=%s",
 								             doc_id, url, data.get('embeds_inserted'))
 								    return {
 								        'doc_id': doc_id,
 								        'url': url,
 								        'embeds_inserted': data.get('embeds_inserted', 0),
 								        'embeds_failed': data.get('embeds_failed', 0),
 								    }