Project for SPIE - Avatar for safety briefing / management event
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

189 lines
5.8 KiB

// Georgy Treshchev 2025.
#include "RuntimeVisemeGenerator.h"
#include "Misc/EngineVersionComparison.h"
#include "Features/IModularFeatures.h"
#include "RuntimeMetaHumanLipSyncModule.h"
#include "SampleBuffer.h"
#include "DSP/AudioFFT.h"
#include "Tasks/Pipe.h"
URuntimeVisemeGenerator::URuntimeVisemeGenerator()
	: DataGuard(MakeShared<FCriticalSection>())
{
	// Dedicated task pipe so queued audio chunks are executed strictly in submission order.
	// The pipe's debug name embeds this object's name to make task traces identifiable.
	const FString PipeDebugName = FString::Printf(TEXT("AudioTaskPipe_%s"), *GetName());
	AudioTaskPipe = MakeUnique<UE::Tasks::FPipe>(*PipeDebugName);
	ensureMsgf(AudioTaskPipe, TEXT("AudioTaskPipe is not initialized. This will cause issues with audio data appending"));
}
void URuntimeVisemeGenerator::BeginDestroy()
{
	// Tear down any processor-side context bound to this object before the
	// UObject machinery destroys it, then drop our non-owning pointer.
	if (LipSyncProcessor != nullptr)
	{
		LipSyncProcessor->CleanupContext(this);
		LipSyncProcessor = nullptr;
	}

	Super::BeginDestroy();
}
/** Factory for Blueprint/script use: constructs a fresh generator with no explicit outer (GC-managed). */
URuntimeVisemeGenerator* URuntimeVisemeGenerator::CreateRuntimeVisemeGenerator()
{
	URuntimeVisemeGenerator* Generator = NewObject<URuntimeVisemeGenerator>();
	return Generator;
}
void URuntimeVisemeGenerator::ProcessAudioData(TArray<float> PCMData, int32 SampleRate, int32 NumOfChannels)
{
if (NumOfChannels <= 0)
{
UE_LOG(LogRuntimeMetaHumanLipSync, Error, TEXT("Unable to mix audio data because the number of channels is invalid (%d)"), NumOfChannels);
return;
}
if (IsInGameThread())
{
AudioTaskPipe->Launch(AudioTaskPipe->GetDebugName(), [WeakThis = MakeWeakObjectPtr(this), PCMData = MoveTemp(PCMData), SampleRate, NumOfChannels]() mutable
{
if (WeakThis.IsValid())
{
WeakThis->ProcessAudioData(MoveTemp(PCMData), SampleRate, NumOfChannels);
}
else
{
UE_LOG(LogRuntimeMetaHumanLipSync, Error, TEXT("Unable to mix audio data because the runtime viseme generator object has been destroyed"));
}
}, UE::Tasks::ETaskPriority::BackgroundHigh);
return;
}
if (NumOfChannels > 1)
{
Audio::FAlignedFloatBuffer PCMData_AlignedFloatBuffer = Audio::FAlignedFloatBuffer(MoveTemp(PCMData));
MixChannelsRAWData(PCMData_AlignedFloatBuffer, SampleRate, NumOfChannels, 1, PCMData_AlignedFloatBuffer);
PCMData = MoveTemp(PCMData_AlignedFloatBuffer);
}
UE_LOG(LogRuntimeMetaHumanLipSync, VeryVerbose, TEXT("Processing audio data with %d sample rate and %d num of samples"), SampleRate, PCMData.Num());
if (!LipSyncProcessor)
{
static const FName LipSyncFeatureName(TEXT("RuntimeLipSyncProcessor"));
#if UE_VERSION_NEWER_THAN(5, 1, 0)
IModularFeatures::FScopedLockModularFeatureList ScopedLockModularFeatureList;
#endif
if (IModularFeatures::Get().IsModularFeatureAvailable(LipSyncFeatureName))
{
// Get the first available implementation
TArray<ILipSyncProcessor*> LipSyncProcessors = IModularFeatures::Get().GetModularFeatureImplementations<ILipSyncProcessor>(LipSyncFeatureName);
if (LipSyncProcessors.Num() > 0)
{
// Use the first available processor
LipSyncProcessor = LipSyncProcessors[0];
}
else
{
UE_LOG(LogRuntimeMetaHumanLipSync, Warning, TEXT("LipSync feature is available but no implementations were found"));
return;
}
}
else
{
UE_LOG(LogRuntimeMetaHumanLipSync, Warning, TEXT("No LipSync processor feature available. Please refer to the documentation on how to enable it: <https://docs.georgy.dev/runtime-metahuman-lip-sync/how-to-use-the-plugin>"));
return;
}
}
if (!LipSyncProcessor)
{
return;
}
UObject* ContextOwner = this;
// Initialize if needed
if (!LipSyncProcessorInitialized)
{
LipSyncProcessor->Initialize(ContextOwner, SampleRate, BufferSize);
LipSyncProcessor->SetAsyncCallback(ContextOwner, [this](const TArray<float>& NewVisemes, float NewLaughterScore)
{
FScopeLock Lock(&*DataGuard);
Visemes = NewVisemes;
LaughterScore = NewLaughterScore;
});
LipSyncProcessorInitialized = true;
}
// Process the audio data
LipSyncProcessor->ProcessFrameAsync(ContextOwner, (const float*)PCMData.GetData(), PCMData.Num());
}
TArray<float> URuntimeVisemeGenerator::GetVisemeWeights() const
{
	// Copy out under the lock so the async processor callback cannot mutate mid-read
	FScopeLock Lock(DataGuard.Get());
	TArray<float> VisemesCopy = Visemes;
	return VisemesCopy;
}
TArray<FString> URuntimeVisemeGenerator::GetVisemeNames()
{
	// Fixed viseme name list; built once via thread-safe function-local static
	// initialization and returned by value on every call.
	static const TArray<FString> VisemeNames = {
		TEXT("sil"), TEXT("PP"), TEXT("FF"), TEXT("TH"), TEXT("DD"),
		TEXT("kk"),  TEXT("CH"), TEXT("SS"), TEXT("nn"), TEXT("RR"),
		TEXT("aa"),  TEXT("E"),  TEXT("ih"), TEXT("oh"), TEXT("ou")
	};
	return VisemeNames;
}
/**
 * Replaces the current viseme weights.
 *
 * Fix: the write is now guarded by DataGuard. The async processor callback and
 * GetVisemeWeights() both access Visemes under this lock, so the previous
 * unguarded assignment was a data race against them.
 *
 * @param InWeights New weights, consumed by move.
 */
void URuntimeVisemeGenerator::SetVisemeWeights(TArray<float> InWeights)
{
	FScopeLock Lock(&*DataGuard);
	Visemes = MoveTemp(InWeights);
}
/**
 * Remixes interleaved float PCM from SourceNumOfChannels to DestinationNumOfChannels.
 * RAWData may alias RemixedRAWData. Returns false (with an error log) on invalid arguments.
 */
bool URuntimeVisemeGenerator::MixChannelsRAWData(Audio::FAlignedFloatBuffer& RAWData, int32 SampleRate, int32 SourceNumOfChannels, int32 DestinationNumOfChannels, Audio::FAlignedFloatBuffer& RemixedRAWData)
{
	// Validate every numeric argument before touching the buffers
	// (order preserved: it determines which error gets logged first)
	if (SampleRate <= 0)
	{
		UE_LOG(LogRuntimeMetaHumanLipSync, Error, TEXT("Unable to mix audio data because the sample rate is invalid (%d)"), SampleRate);
		return false;
	}
	if (SourceNumOfChannels <= 0)
	{
		UE_LOG(LogRuntimeMetaHumanLipSync, Error, TEXT("Unable to mix audio data because the source number of channels is invalid (%d)"), SourceNumOfChannels);
		return false;
	}
	if (DestinationNumOfChannels <= 0)
	{
		UE_LOG(LogRuntimeMetaHumanLipSync, Error, TEXT("Unable to mix audio data because the destination number of channels is invalid (%d)"), DestinationNumOfChannels);
		return false;
	}

	// Identical layout: pass the buffer straight through without remixing
	if (SourceNumOfChannels == DestinationNumOfChannels)
	{
		RemixedRAWData = MoveTemp(RAWData);
		return true;
	}

	// TSampleBuffer copies the input samples, which is what makes the
	// aliasing of RAWData / RemixedRAWData safe here
	Audio::TSampleBuffer<float> PCMSampleBuffer(RAWData, SourceNumOfChannels, SampleRate);
	PCMSampleBuffer.MixBufferToChannels(DestinationNumOfChannels);
	RemixedRAWData = Audio::FAlignedFloatBuffer(PCMSampleBuffer.GetData(), PCMSampleBuffer.GetNumSamples());
	return true;
}