ElevenLabs + React를 이용해 TTS(Text To Speech)를 구현해보기

카테고리 없음

ElevenLabs + React를 이용해 TTS(Text To Speech)를 구현해보기

곽빵 2024. 12. 7. 17:27

개요

ElevenLabs를 사용해 텍스트를 AI음성으로 변환하는 코드를 적어보았다.

ElevenLabs란?

AI를 활용하여 텍스트를 실제 사람의 목소리처럼 들리게 변환하는 TTS(Text To Speech) 소프트웨어를 제공한다.

이를 통해 오디오북, 비디오 내레이션, 팟캐스트 등 다양한 콘텐츠에서 고품질의 음성 출력을 생성할 수 있다. 또한 사용자가 자신의 목소리를 업로드하면 AI가 이를 학습해 유사한 음성을 생성해 주는 음성 복제 기능도 제공한다.

(참고로 무료로 10000자의 음성변환을 제공하니 간단하게 가입해서 테스트를 해볼 수 있다.)

이하의 가이드를 보면 이용할 수 있는 API나 모델, 음성 옵션 등에 대한 문서가 잘 작성되어 있어 참고가 되었다.

https://elevenlabs.io/docs/product/introduction

Developer Guides and API Reference Introduction | ElevenLabs Docs

Text to Speech (Speech Synthesis) Our Text-to-Speech technology, also known as Speech Synthesis, is the core of ElevenLabs. It serves as the foundation for many of the features we offer and powers many services worldwide. This technology transforms text in

elevenlabs.io

이번에 활용한 API는 이하의 스트림 TTS이다. 변환할 텍스트가 장문이든 단문이든 상관없이 응답을 가장 빨리 받을 수 있어서 이 친구를 선택했다.

https://elevenlabs.io/docs/api-reference/streaming

Text to Speech Stream - ElevenLabs

Output format of the generated audio. Must be one of: mp3_22050_32 - output format, mp3 with 22.05kHz sample rate at 32kbps. mp3_44100_32 - output format, mp3 with 44.1kHz sample rate at 32kbps. mp3_44100_64 - output format, mp3 with 44.1kHz sample rate at

elevenlabs.io

그럼 이제 useTextToSpeech라는 훅을 작성해보고자 한다.

옵션으로 받는 interface와 사용할 state

/** Options accepted by the `useTextToSpeech` hook. */
interface UseTextToSpeechOptions {
  /** ID of the model to use. */
  modelId: string;
  /** ElevenLabs voice ID. */
  voiceId: string;
  /** Voice stability (0 to 1). */
  stability?: number;
  /** Similarity to the original voice (0 to 1). */
  similarityBoost?: number;
  /** Style exaggeration. */
  styleExaggeration?: number;
  /** Whether to use speaker boost. */
  useSpeakerBoost?: boolean;
  /** Playback speed. */
  playbackRate?: number;
  /** Called when playback starts. */
  onStart?: () => void;
  /** Called when playback ends. */
  onEnd?: () => void;
  /** Called when an error occurs. */
  onError?: (error: Error) => void;
}

// Hook entry point; `options` carries the voice/model settings and callbacks.
export function useTextToSpeech(options: UseTextToSpeechOptions) {
  // Whether speech is currently playing.
  const [isPlaying, setIsPlaying] = useState(false);
  // Audio element of the current playback, or null when there is none.
  const [audioElement, setAudioElement] = useState<HTMLAudioElement | null>(null);

isPlaying: 현재 재생 상태
audioElement: 현재 재생 중인 오디오 엘리먼트

재생 중지 함수

// Pauses the current audio element (if any), clears the playing flag and
// notifies the caller through `onEnd`. Does nothing when no audio exists.
const stopSpeech = useCallback(() => {
  if (!audioElement) return;

  audioElement.pause();
  setIsPlaying(false);
  options.onEnd?.();
}, [audioElement, options]);

재생 함수(핵심)

/**
 * Converts `text` to speech through the ElevenLabs streaming endpoint and
 * plays it while the response is still downloading.
 *
 * Calling it while something is already playing stops playback instead, so a
 * single button can act as a play/stop toggle. Failures are reported through
 * `options.onError`; the returned promise does not reject for them.
 *
 * @param text Text to synthesize.
 */
const speakText = useCallback(
  async (text: string) => {
    // Already playing: behave as a toggle and stop.
    if (isPlaying) {
      stopSpeech();
      return;
    }

    try {
      setIsPlaying(true);
      options.onStart?.();

      // Call the ElevenLabs API.
      const response = await fetch(
        `${process.env.NEXT_PUBLIC_ELEVENLABS_API_ENDPOINT}text-to-speech/${options.voiceId}/stream`,
        {
          method: 'POST',
          headers: {
            'Content-Type': 'application/json',
            'xi-api-key': process.env.NEXT_PUBLIC_ELEVENLABS_API_KEY || '',
          },
          body: JSON.stringify({
            text,
            voice_settings: {
              // `??` rather than `||` so that an explicit 0 is honoured.
              stability: options.stability ?? 0.5,
              similarity_boost: options.similarityBoost ?? 0.75,
              style: options.styleExaggeration ?? 0,
              use_speaker_boost: options.useSpeakerBoost ?? false,
            },
            model_id: options.modelId,
          }),
        },
      );

      // An error response carries no audio; do not feed it to the SourceBuffer.
      if (!response.ok) {
        throw new Error(`HTTP error! status: ${response.status}`);
      }

      const reader = response.body?.getReader();
      if (!reader) throw new Error('Failed to create stream reader');

      // Set up streaming playback.
      const mediaSource = new MediaSource();
      const audio = new Audio(URL.createObjectURL(mediaSource));
      const rate = options.playbackRate ?? 1;
      // Loading a source resets playbackRate to defaultPlaybackRate, so set both.
      audio.defaultPlaybackRate = rate;
      audio.playbackRate = rate;
      setAudioElement(audio);

      mediaSource.addEventListener(
        'sourceopen',
        async () => {
          try {
            const sourceBuffer = mediaSource.addSourceBuffer('audio/mpeg');

            // Resolves once the SourceBuffer is idle (immediately if it already is).
            const waitForUpdateEnd = () =>
              new Promise<void>((resolve) => {
                if (!sourceBuffer.updating) {
                  resolve();
                } else {
                  sourceBuffer.addEventListener('updateend', () => resolve(), { once: true });
                }
              });

            let playbackStarted = false;

            // Feed the response to the SourceBuffer chunk by chunk.
            while (true) {
              const { done, value } = await reader.read();
              if (done) break;

              // appendBuffer throws while a previous append is still in progress.
              await waitForUpdateEnd();
              sourceBuffer.appendBuffer(value);

              // Start playback as soon as the first chunk has been queued.
              if (!playbackStarted) {
                playbackStarted = true;
                audio.play().catch((error: unknown) => {
                  const reason = error instanceof Error ? error.message : String(error);
                  options.onError?.(new Error('Audio playback failed: ' + reason));
                });
              }
            }

            // endOfStream throws while the last append is still updating.
            await waitForUpdateEnd();
            if (mediaSource.readyState === 'open') {
              mediaSource.endOfStream();
            }
          } catch (error) {
            // Without this the rejection of the async handler would go unnoticed.
            setIsPlaying(false);
            options.onError?.(error instanceof Error ? error : new Error('Failed to stream speech'));
          }
        },
        { once: true },
      );

      // Audio element event handlers.
      audio.onended = () => {
        setIsPlaying(false);
        setAudioElement(null);
        options.onEnd?.();
      };

      audio.onerror = () => {
        setIsPlaying(false);
        options.onError?.(new Error('Audio playback failed'));
      };
    } catch (error) {
      setIsPlaying(false);
      options.onError?.(error instanceof Error ? error : new Error('Failed to generate speech'));
    }
  },
  [isPlaying, options, stopSpeech],
);

text-to-speech/stream API는 응답 본문을 스트림 형태로 보내 주기 때문에, Web API인 Streams API의 response.body.getReader()를 이용해 청크 단위로 데이터를 읽을 수 있다.

사용예시

// Create the hook with a model, a voice, optional tuning values and callbacks.
// NOTE(review): 'voice-id' looks like a placeholder for a real ElevenLabs voice ID.
const { isPlaying, speakText, stopSpeech } = useTextToSpeech({
  modelId: 'eleven_multilingual_v2',
  voiceId: 'voice-id',
  stability: 0.5,
  similarityBoost: 0.75,
  onStart: () => console.log('재생 시작'),
  onEnd: () => console.log('재생 종료'),
});

// Usage: speakText stops playback when something is already playing,
// so a single button works as a play/stop toggle.
<button onClick={() => speakText('안녕하세요')}>
  {isPlaying ? '정지' : '재생'}
</button>

전체코드

import { useCallback, useRef, useState } from 'react';

// ElevenLabs API docs: https://elevenlabs.io/docs/developer-guides/
/** Options accepted by the `useTextToSpeech` hook. */
interface UseTextToSpeechOptions {
  /** ID of the model to use. */
  modelId: string;
  /** ElevenLabs voice ID. */
  voiceId: string;
  /** Voice stability (0 to 1). */
  stability?: number;
  /** Similarity to the original voice (0 to 1). */
  similarityBoost?: number;
  /** Style exaggeration. */
  styleExaggeration?: number;
  /** Whether to use speaker boost. */
  useSpeakerBoost?: boolean;
  /** Playback speed. */
  playbackRate?: number;
  /** Called when playback starts. */
  onStart?: () => void;
  /** Called when playback ends. */
  onEnd?: () => void;
  /** Called when an error occurs. */
  onError?: (error: Error) => void;
}

/**
 * React hook that converts text to speech with the ElevenLabs streaming
 * endpoint and plays the audio through a `MediaSource` while the response is
 * still downloading.
 *
 * Every failure is reported through `options.onError`; the functions returned
 * by the hook do not throw or reject for request, streaming or playback errors.
 *
 * NOTE(review): the API key is read from a `NEXT_PUBLIC_` variable and is sent
 * from the browser, so it is visible to every visitor. Consider proxying the
 * request through a server route that holds the key.
 *
 * @param options Voice/model settings and lifecycle callbacks.
 * @returns `isPlaying` (whether a speech session is active), `speakText`
 *   (starts speaking, or stops when a session is active) and `stopSpeech`
 *   (cancels the active session).
 */
export function useTextToSpeech(options: UseTextToSpeechOptions) {
  const [isPlaying, setIsPlaying] = useState(false);

  // Controller of the active session (request + playback); null when idle.
  // Aborting it cancels the fetch/stream, pauses the audio element and
  // releases the object URL, so it doubles as the "session is over" flag.
  const sessionRef = useRef<AbortController | null>(null);

  // Cancels the active session, including one whose request is still in flight.
  const stopSpeech = useCallback(() => {
    const session = sessionRef.current;
    if (!session) return;

    sessionRef.current = null;
    session.abort();
    setIsPlaying(false);
    options.onEnd?.();
  }, [options]);

  // Core function: synthesizes `text` and plays it, or stops when already playing.
  const speakText = useCallback(
    async (text: string) => {
      // Already playing: behave as a toggle and stop.
      if (isPlaying) {
        stopSpeech();
        return;
      }

      // Defensive: two calls can arrive before `isPlaying` has re-rendered.
      sessionRef.current?.abort();
      const session = new AbortController();
      sessionRef.current = session;
      const { signal } = session;

      // Ends this session at most once. With an error it reports through
      // `onError`, otherwise through `onEnd`. It is a no-op when the session
      // was already stopped or finished.
      const finish = (error?: Error) => {
        if (signal.aborted) return;
        if (sessionRef.current === session) sessionRef.current = null;
        session.abort();
        setIsPlaying(false);
        if (error) {
          options.onError?.(error);
        } else {
          options.onEnd?.();
        }
      };

      try {
        // Fail before spending API quota when the browser cannot stream MP3
        // through Media Source Extensions.
        if (typeof MediaSource === 'undefined' || !MediaSource.isTypeSupported('audio/mpeg')) {
          throw new Error('이 브라우저는 audio/mpeg MediaSource 스트리밍을 지원하지 않습니다');
        }

        setIsPlaying(true);
        options.onStart?.();

        // Call the ElevenLabs API.
        const response = await fetch(
          `${process.env.NEXT_PUBLIC_ELEVENLABS_API_ENDPOINT}text-to-speech/${options.voiceId}/stream`,
          {
            method: 'POST',
            headers: {
              'Content-Type': 'application/json',
              'xi-api-key': process.env.NEXT_PUBLIC_ELEVENLABS_API_KEY || '',
            },
            body: JSON.stringify({
              text,
              voice_settings: {
                // `??` rather than `||` so that an explicit 0 is honoured.
                stability: options.stability ?? 0.5,
                similarity_boost: options.similarityBoost ?? 0.75,
                style: options.styleExaggeration ?? 0,
                use_speaker_boost: options.useSpeakerBoost ?? false,
              },
              model_id: options.modelId,
            }),
            signal,
          },
        );

        if (!response.ok) {
          throw new Error(`HTTP error! status: ${response.status}`);
        }

        // Create the stream reader.
        const reader = response.body?.getReader();
        if (!reader) throw new Error('스트림 리더를 생성할 수 없습니다');

        // Set up the MediaSource-backed audio element.
        const mediaSource = new MediaSource();
        const objectUrl = URL.createObjectURL(mediaSource);
        const audio = new Audio(objectUrl);

        // Runs when the session is stopped or finished, whatever the reason.
        signal.addEventListener(
          'abort',
          () => {
            audio.pause();
            URL.revokeObjectURL(objectUrl);
          },
          { once: true },
        );

        const rate = options.playbackRate ?? 1;
        // Loading a source resets playbackRate to defaultPlaybackRate, so set both.
        audio.defaultPlaybackRate = rate;
        audio.playbackRate = rate;

        audio.onended = () => finish();
        audio.onerror = () => finish(new Error('오디오 재생 실패'));

        mediaSource.addEventListener(
          'sourceopen',
          async () => {
            try {
              const sourceBuffer = mediaSource.addSourceBuffer('audio/mpeg');

              // Resolves once the SourceBuffer is idle (immediately if it already is).
              const waitForUpdateEnd = () =>
                new Promise<void>((resolve) => {
                  if (!sourceBuffer.updating) {
                    resolve();
                  } else {
                    sourceBuffer.addEventListener('updateend', () => resolve(), { once: true });
                  }
                });

              let playbackStarted = false;

              // Feed the response to the SourceBuffer chunk by chunk.
              while (true) {
                const { done, value } = await reader.read();
                if (done) break;

                // appendBuffer throws while a previous append is still in progress.
                await waitForUpdateEnd();
                if (signal.aborted) return;
                sourceBuffer.appendBuffer(value);

                // Start playback as soon as the first chunk has been queued.
                if (!playbackStarted) {
                  playbackStarted = true;
                  audio.play().catch((error: unknown) => {
                    const reason = error instanceof Error ? error.message : String(error);
                    finish(new Error('오디오 재생 실패: ' + reason));
                  });
                }
              }

              // Wait for the last append, then mark the stream as complete so
              // that the audio element can fire `ended`.
              await waitForUpdateEnd();
              if (signal.aborted) return;
              if (mediaSource.readyState === 'open') {
                mediaSource.endOfStream();
              }

              // An empty response never plays, so `ended` would never fire.
              if (!playbackStarted) finish();
            } catch (error) {
              // A stop request surfaces here as an AbortError; that is not a failure.
              if (signal.aborted) return;
              console.error('스트리밍 에러:', error);
              finish(error instanceof Error ? error : new Error('스트리밍 실패'));
            }
          },
          { once: true },
        );
      } catch (error) {
        // `finish` ignores errors caused by an intentional stop.
        finish(error instanceof Error ? error : new Error('음성 생성 실패'));
      }
    },
    [isPlaying, options, stopSpeech],
  );

  // Values exposed by the hook.
  return {
    isPlaying, // whether a speech session is active
    speakText, // converts text to speech and plays it
    stopSpeech, // stops playback
  };
}