/* * Copyright 1993-2007 NVIDIA Corporation. All rights reserved. */ /* WaveX02 project which demonstrates the basics on how to setup a project * example application. * HOST CODE. */ // includes, system #include #include #include #include #include #include #include #include // includes, project #include #include // Include, PortAudio #include "pa_win_ds.h" // includes, kernels #include #define SAMPLE_RATE (44100) #define FRAMES_PER_BUFFER (8192) #define NUM_CHANNELS (8) #define DITHER_FLAG (0) /**/ #define PA_SAMPLE_TYPE paInt16 #define SAMPLE_SIZE (2) //bytes #define SAMPLE_SILENCE (0) #define CLEAR(a) bzero( (a), FRAMES_PER_BUFFER * NUM_CHANNELS * SAMPLE_SIZE ) #define PRINTF_S_FORMAT "%d" //////////////////////////////////////////////////////////////////////////////// // declaration, forward void ReadFIRCoeff(void); //Read desired Frequency void ReadConfig(char * ConfigFile); //Read Configuration int ProcessWav(char *infilename, char *outfilename, void * userData); //Prototype Routine char InputFile[256]; char ConfigFile[256]; char OutputFile[256]; //parameters int DivideCount; //4 = 4Way FIR Divider int OutputFormat_Bit; //16 or 24 int SampleRate; //44100, bool StreamOut; //true:output to PortAudio bool FileOut; //true:output to File #define DEFTAPS 8192 struct FIRParams{ int DelayLength; //Channel Delay Sample Length int chPointer[2]; //Channel Processing Pointer for Wave Data (initialized negative for delay) bool Enabled; //Channel Used int Offset; //Offset in dB double dOffset; //Offset in double, to calculate char CoeffFile[256]; //Coeff File Name float FIRCoeff[DEFTAPS]; //Coeff Value }; struct FIRParams FIR[4]; const unsigned int CUDABLOCKS = 16; //User Data struct used between main routine and CallBack Function typedef struct { unsigned char * po; //Main Data Buffer unsigned long len; //file total length unsigned long pCounter; //current counter int numBytes; //Block Size FILE *f; unsigned long GPUCounter; //GPU Side counter } paTestData; //Stream CallBack function. //(1) Process Input : not used now //(2) Fill Output Buffer to pass PortAudio //(3) Return Status Flag, Continue or Complete static int patestCallback( const void *inputBuffer, void *outputBuffer, unsigned long framesPerBuffer, const PaStreamCallbackTimeInfo* timeInfo, PaStreamCallbackFlags statusFlags, void *userData ) { paTestData *data = (paTestData*)userData; char *out = (char *)outputBuffer; (void) timeInfo; /* Prevent unused variable warnings. */ (void) statusFlags; (void) inputBuffer; if ((data->pCounter > data->len) || (data->pCounter > (data->GPUCounter - data->numBytes))) { return paComplete; } else { if (data->GPUCounter < 1411200) //2 sec { //buffer not enough memset(out, 0, data->numBytes); } else { memcpy(out, (data->po)+(data->pCounter),data->numBytes); data->pCounter += data->numBytes; } return paContinue; } } //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { //Check Arguments Count if (argc != 4) { printf("requires 3 arguments.\n"); printf("WaveX02 inWaveFile ConfigFile OutWaveXFile\n"); printf("WaveX02 \"C:\\Temp\\MusicFile.wav\" \"C:\\Temp\\WaveX.cfg\" \"C:\\Temp\\MusicFileOut.wav\" \n"); return -1; } strncpy_s(InputFile,256, argv[1],strlen(argv[1])); strncpy_s(ConfigFile,256, argv[2],strlen(argv[2])); strncpy_s(OutputFile,256, argv[3],strlen(argv[3])); ReadConfig(ConfigFile); ReadFIRCoeff(); PaStreamParameters outputParameters; PaStream *stream = NULL; PaError err; int numBytes; int deviceIndex; paTestData data; int StreamStatus = 1; //Block size for output buffer. numBytes = FRAMES_PER_BUFFER * NUM_CHANNELS * SAMPLE_SIZE ; err = Pa_Initialize(); if( err != paNoError ) goto error; //Select Device which supports DirectSound API deviceIndex = Pa_GetHostApiInfo( Pa_HostApiTypeIdToHostApiIndex( paDirectSound ) )->defaultOutputDevice; printf( "using device id %d (%s)\n", deviceIndex, Pa_GetDeviceInfo(deviceIndex)->name ); outputParameters.device = deviceIndex; /* DirectSound output device */ printf( "Output device # %d.\n", outputParameters.device ); printf( "Output LL: %g s\n", Pa_GetDeviceInfo( outputParameters.device )->defaultLowOutputLatency ); printf( "Output HL: %g s\n", Pa_GetDeviceInfo( outputParameters.device )->defaultHighOutputLatency ); outputParameters.channelCount = NUM_CHANNELS; outputParameters.sampleFormat = PA_SAMPLE_TYPE; outputParameters.suggestedLatency = Pa_GetDeviceInfo( outputParameters.device )->defaultHighOutputLatency; //DirectSound Specific Stream setting PaWinDirectSoundStreamInfo WinDSInfo; WinDSInfo.size = sizeof(PaWinDirectSoundStreamInfo); WinDSInfo.hostApiType = paDirectSound; WinDSInfo.version = 1; WinDSInfo.channelMask = PAWIN_SPEAKER_7POINT1_SURROUND; //This is 0x063F channel mask WinDSInfo.flags = paWinDirectSoundUseChannelMask; outputParameters.hostApiSpecificStreamInfo = &WinDSInfo; /* -- check -- */ if( Pa_IsFormatSupported( 0, &outputParameters, SAMPLE_RATE ) == paFormatIsSupported ) { printf( "Pa_IsFormatSupported reports device will support %d channels.\n", NUM_CHANNELS ); } else { printf( "Pa_IsFormatSupported reports device will not support %d channels.\n", NUM_CHANNELS ); goto error; } data.pCounter = 0; data.numBytes = numBytes; data.GPUCounter = 0; err = Pa_OpenStream( &stream, NULL, //no input &inputParameters, &outputParameters, SAMPLE_RATE, FRAMES_PER_BUFFER, paClipOff, /* we won't output out of range samples so don't bother clipping them */ patestCallback, &data ); /* userData */ if( err != paNoError ) goto error; //start stream before process, let stream wait. err = Pa_StartStream( stream ); if( err != paNoError ) goto error; printf("stream started.\n"); //Process ProcessWav(InputFile, OutputFile, &data); //wait loop until stream completed. while (StreamStatus > 0) { Pa_Sleep( 1000 ); StreamStatus = Pa_IsStreamActive(stream); printf("waiting, stream. counter : %d \r", data.pCounter); } printf("\n"); printf("loopend, stream ending status: %d .\n", StreamStatus); free(data.po); printf("%s\n",Pa_GetErrorText(err)); err = Pa_StopStream( stream ); if( err != paNoError ) goto error; Pa_CloseStream( stream ); Pa_Terminate(); return 0; error: if( stream ) { Pa_AbortStream( stream ); Pa_CloseStream( stream ); } Pa_Terminate(); fprintf( stderr, "An error occured while using the portaudio stream\n" ); fprintf( stderr, "Error number: %d\n", err ); fprintf( stderr, "Error message: %s\n", Pa_GetErrorText( err ) ); return -1; } void ReadConfig(char* FileName) { char RetStr[256]; for (int i = 0; i < 4; i++) { FIR[i].Enabled = false; memset(FIR[i].CoeffFile, 0, sizeof(FIR[i].CoeffFile)); for (int j = 0; j < DEFTAPS; j++) { FIR[i].FIRCoeff[j] = 0.0; } } GetPrivateProfileString("Global_Params", "WAYS", "1", RetStr, 64, FileName); DivideCount = atoi(RetStr); GetPrivateProfileString("Global_Params", "FORMAT", "16", RetStr, 64, FileName); OutputFormat_Bit = atoi(RetStr); GetPrivateProfileString("Global_Params", "StreamOut", "0", RetStr, 64, FileName); if (strcmp("1",RetStr)==0) StreamOut = true; else StreamOut = false; GetPrivateProfileString("Global_Params", "FileOut", "0", RetStr, 64, FileName); if (strcmp("1",RetStr)==0) FileOut = true; else FileOut = false; GetPrivateProfileString("Global_Params", "SampleRate", "0", RetStr, 64, FileName); SampleRate = atoi(RetStr); GetPrivateProfileString("CH12", "COEFF", "", FIR[0].CoeffFile, 256, FileName); GetPrivateProfileString("CH34", "COEFF", "", FIR[1].CoeffFile, 256, FileName); GetPrivateProfileString("CH56", "COEFF", "", FIR[2].CoeffFile, 256, FileName); GetPrivateProfileString("CH78", "COEFF", "", FIR[3].CoeffFile, 256, FileName); GetPrivateProfileString("CH12", "Delay", "0", RetStr, 64, FileName); FIR[0].DelayLength = atoi(RetStr); GetPrivateProfileString("CH34", "Delay", "0", RetStr, 64, FileName); FIR[1].DelayLength = atoi(RetStr); GetPrivateProfileString("CH56", "Delay", "0", RetStr, 64, FileName); FIR[2].DelayLength = atoi(RetStr); GetPrivateProfileString("CH78", "Delay", "0", RetStr, 64, FileName); FIR[3].DelayLength = atoi(RetStr); GetPrivateProfileString("CH12", "Offset", "0", RetStr, 64, FileName); FIR[0].Offset = atoi(RetStr); GetPrivateProfileString("CH34", "Offset", "0", RetStr, 64, FileName); FIR[1].Offset = atoi(RetStr); GetPrivateProfileString("CH56", "Offset", "0", RetStr, 64, FileName); FIR[2].Offset = atoi(RetStr); GetPrivateProfileString("CH78", "Offset", "0", RetStr, 64, FileName); FIR[3].Offset = atoi(RetStr); for (int i = 0; i < 4; i++) { FIR[i].chPointer[0] -= FIR[i].DelayLength; FIR[i].chPointer[1] -= FIR[i].DelayLength; FIR[i].dOffset = pow(10.0, FIR[i].Offset/10.0); } for (int i = 0; i < DivideCount-1; i++) { FIR[i].Enabled = true; } } void ReadFIRCoeff(void) { FILE * f; char buf[64]; for (int j = 0; j < 4; j++) { fopen_s(&f, FIR[j].CoeffFile,"r"); if (f != NULL) { int i = 0; double tmp = 0.0; while((!feof(f)) && (i < DEFTAPS)) { fgets(buf, sizeof(buf), f); sscanf_s(buf, "%lf", &tmp); FIR[j].FIRCoeff[i] = (float)tmp; i++; } fclose(f); } } } unsigned read2bytes(FILE *f) { unsigned char buf[2]; if (fread(buf, 2, 1, f) != 1) { fprintf(stderr, "Read error\n"); return(1); } return 256U * buf[1] + buf[0] ; } unsigned long read4bytes(FILE *f) { unsigned char buf[4]; if (fread(buf, 4, 1, f) != 1) { fprintf(stderr, "Read error\n"); return(1); } return ((256LU * buf[3] + buf[2]) * 256LU + buf[1] ) * 256LU + buf[0] ; } int CheckInFile(FILE * f) { unsigned long len; unsigned char s[10]; int channels, bits; if (f == NULL) { printf("Can not open File\n"); return -1; } if (fread(s, 4, 1, f) != 1) { printf("Read error\n"); fclose(f); return -1; } if (memcmp(s, "RIFF", 4) != 0) { printf("Not a RIFF format\n"); fclose(f); return -1; } printf("[RIFF] (%lu bytes)\n", read4bytes(f)); if (fread(s, 8, 1, f) != 1) { printf("Read error\n"); fclose(f); return -1; } if (memcmp(s, "WAVEfmt ", 8) != 0) { printf("Not a WAVEfmt format\n"); fclose(f); return -1; } len = read4bytes(f); printf("[WAVEfmt ] (%lu bytes)\n", len); if (len != 16) { printf("Length of WAVEfmt must be 16\n"); return -1; } printf(" Data type = %u (1 = PCM)\n", read2bytes(f)); channels = read2bytes(f); printf(" Number of channels = %u (1 = mono, 2 = stereo)\n", channels); printf(" Sampling rate = %luHz\n", read4bytes(f)); printf(" Bytes per second = %lu\n", read4bytes(f)); printf(" Bytes per sample = %u\n", read2bytes(f)); bits = read2bytes(f); printf(" Bits per sample = %u\n", bits); return 0; } void PrepareOutFile(FILE * fo, unsigned long len, int channels, int Sample_sec, int BitFormat) { unsigned long temp_l; unsigned short temp_s; unsigned char s[20]; // Prepare Output Wave File // RIFF s[0] = 'R'; s[1] = 'I'; s[2] = 'F'; s[3] = 'F'; fwrite(s, 1, 4, fo); //filesize if (channels == 2) { temp_l = len + 36; } else //extended { temp_l = len + 120; } fwrite(&(temp_l), sizeof(long), 1, fo); // WAVE s[0] = 'W'; s[1] = 'A'; s[2] = 'V'; s[3] = 'E'; fwrite(s, 1, 4, fo); // fmt chunk s[0] = 'f'; s[1] = 'm'; s[2] = 't'; s[3] = ' '; fwrite(s, 1, 4, fo); // chunk size if (channels == 2) { temp_l = 16; } else //extended { temp_l = 40; } fwrite(&(temp_l), sizeof(long), 1, fo); // format PCM = 1 if (channels == 2) { temp_s = 1; } else //extended, 0xFFFE { temp_s = 0xFFFE; } fwrite(&(temp_s), sizeof(short), 1, fo); // channel stereo = 2, or Extended temp_s = channels; fwrite(&(temp_s), sizeof(short), 1, fo); // sample 44100, 48000, 88200, 96000 etc temp_l = Sample_sec; fwrite(&(temp_l), sizeof(long), 1, fo); // Bytes per sec temp_l = channels * BitFormat/8 * Sample_sec; fwrite(&(temp_l), sizeof(long), 1, fo); // Block (Bytes per sample&channel) temp_s = channels * BitFormat/8 ; fwrite(&(temp_s), sizeof(short), 1, fo); // Bits per sample temp_s = BitFormat ; fwrite(&(temp_s), sizeof(short), 1, fo); //Extended Format area if (channels > 2) { //22 bytes extended area temp_s = 22; fwrite(&(temp_s), sizeof(short), 1, fo); //bit width temp_s = BitFormat ; fwrite(&(temp_s), sizeof(short), 1, fo); //channel Mask. 2=0x03, 4=0x0F, 6=0x3F, 8=0xFF temp_l = (1<= 1) break; } if (dev == deviceCount) { fprintf(stderr, "There is no device supporting CUDA.\n"); exit(EXIT_FAILURE); } else cudaSetDevice(dev); } int ProcessWav(char *infilename, char *outfilename, void * userData) { //Variables //p : pointer for original wav data memory. (BYTE pointer) //WaveBytesLen : Total Data BYTEs in original wav file. //PacketLength : Processing in GPU, Data Count. handled data is float. //TotalDataSize : Resulting WAVx file, data chunk size. //MaxDelaySize : each channel has delay sampling count. most large one, it effects result size. //h_idata : memory, Host, to put into GPU. //InputDataLen : Posted Data length as float, = PacketLength + DEFTAPS. //d_coeff : memory, GPU, to store coefficient.size is 4*DEFTAPS //d_idata : memory, GPU, to store input wave data. length needs packetlen + coeff len. //d_odata : memory, GPU, result data stored. size is equal to input packet size. (not including coeffs tail size) //h_odata : memory, Host, to store result. there are total 8 (4Way * L,R) buffer. size is same as packet. //sWave_Out_Data : memory, Host, to store 16 bit WAVx format. type = short. PacketLength*sizeof(short)*2*DivideCount //wWave_Out_Data : memory, Host, to store 24 bit WAVx format. type = 3 bytes. PacketLength*sizeof(char)*3*2*DivideCount //FIR[j].DelayLength: Channel Delay, as Sample Length. //FIR[j].chPointer[k] : Current sampling pointer count for the channel. k=0(L),1(R) //FIR[j].Enabled : channel enabled flag. //FIR[j].FIRCoeff[m]: Channel FIR Coeff. //LoopCount : calculated main loop max count. ceiled to int. FILE *f, *fo; unsigned long WaveBytesLen; unsigned char s[10]; paTestData *data = (paTestData*)userData; init_CUDA(); printf("finename = '%s'\n", infilename); fopen_s(&f, infilename,"rb"); if (CheckInFile(f) == -1) return -1; while (fread(s, 4, 1, f) == 1) { WaveBytesLen = read4bytes(f); s[4] = 0; printf("[%s] (%lu bytes)\n", s, WaveBytesLen); if (memcmp(s, "data", 4) == 0) break; for (int i = 0; i < (int)WaveBytesLen; i++) printf("%02x ", fgetc(f)); printf("\n"); } const unsigned int PacketLength = DEFTAPS * 8; //File Data Size int TotalDataSize = 0; //Delay size int MaxDelaySize = 0; //Padding Size int PaddingLen = 0; //Sample Size int SampleCount = WaveBytesLen / 4; //Size of actually to be calculated int CalculationSize = 0; //Calculate File Length //TotalSize = (WaveLength + MaxDelay)*ways*Bytes for (int i = 0; i < DivideCount; i++) { if (FIR[i].DelayLength > MaxDelaySize) MaxDelaySize = FIR[i].DelayLength; } // Calculation have to be N x PacketLength. // Each calculation requires PacketLength + DEFTAPS samples. CalculationSize = (DEFTAPS + MaxDelaySize + SampleCount); PaddingLen = PacketLength - (CalculationSize % PacketLength); TotalDataSize = (CalculationSize + PaddingLen) * DivideCount * 2 * (OutputFormat_Bit/8) ; //File Output Mode, Prepare output file if (FileOut) { fopen_s(&fo, outfilename,"wb"); if (fo == NULL) { printf("Can not create %s\n", outfilename); return -1; } PrepareOutFile(fo, TotalDataSize, DivideCount * 2, SampleRate, OutputFormat_Bit); } //allocate memory for Straight Buffer. if (StreamOut) { data->po = (unsigned char *)malloc(TotalDataSize); VirtualLock(data->po, TotalDataSize); data->GPUCounter = 0; memset(data->po, 0, TotalDataSize); } //allocate memory for CalculationSize + Padding + Tailing TAPS area bytes. unsigned char * p; p = (unsigned char *)malloc((CalculationSize + PaddingLen + DEFTAPS)*sizeof(short)*2 ); VirtualLock(p, (CalculationSize + PaddingLen + DEFTAPS)*sizeof(short)*2 ); //fill top taps (L & R) for (int i = 0; i < DEFTAPS*2; i++) { (*(p + i)) = 0; } //Copy File to Memory, offset Head Taps area(L&R) int readsize = fread((p + DEFTAPS*2), 1, WaveBytesLen ,f); //Fill Tail with 0 for (int i = 0; i < sizeof(short)*2*(MaxDelaySize + PaddingLen + DEFTAPS); i++) { (*(p + DEFTAPS*2 + WaveBytesLen + i)) = 0; } // Now WAV file is ready in Host Memory. //Prepare Host and Device Memory float * h_idata; //Host Data to Go GPU int InputDataLen = PacketLength + DEFTAPS; cudaError_t cret; cret = cudaMallocHost((void**)&h_idata, sizeof(float)*InputDataLen); if (cret != CUDA_SUCCESS) { printf("Can not allocate %d bytes host memory.\n", sizeof(float)*InputDataLen); return -1; } //allocate GPU device input memory float * d_idata; CUDA_SAFE_CALL(cudaMalloc((void**) &d_idata, sizeof(float)*InputDataLen) ); // allocate GPU device memory for result float * d_odata; CUDA_SAFE_CALL(cudaMalloc((void**) &d_odata, sizeof(float)*PacketLength) ); //allocate host output (from GPU) memory float * h_odata[8]; //for 4 way * (L&R) for (int i = 0 ; i < 8; i++) { cret = cudaMallocHost((void**)&(h_odata[i]), sizeof(float)*PacketLength); if (cret != CUDA_SUCCESS) { printf("Can not allocate %d bytes host memory.\n", sizeof(float)*PacketLength); return -1; } } //allocate host WAVE format buffer //16bit buffer. 1L1L 1R1R 2L2L 2R2R 3L3L 3R3R 4L4L 4R4R short * sWave_Out_Data; cret = cudaMallocHost((void**)&sWave_Out_Data, PacketLength*sizeof(short)*2*DivideCount); if (cret != CUDA_SUCCESS) { printf("Can not allocate %d bytes host memory.\n", PacketLength*sizeof(short)*2*DivideCount); return -1; } //24bit buffer. 1L1L1L 1R1R1R 2L2L2L ... unsigned char * wWave_Out_Data; cret = cudaMallocHost((void**)&wWave_Out_Data, PacketLength*sizeof(char)*3*2*DivideCount); if (cret != CUDA_SUCCESS) { printf("Can not allocate %d bytes host memory.\n", PacketLength*sizeof(char)*3*2*DivideCount); return -1; } // setup execution parameters dim3 grid( CUDABLOCKS, 1, 1); dim3 threads( THREAD_NUM, 1, 1); //Loop for each Packet(total length/65536) int LoopCount = 0; LoopCount = (CalculationSize + PaddingLen) / PacketLength; for (int i = 0; i < LoopCount; i++) { //Loop for FIR[0], [1], [2], [3]: Needs to change Coeffs. for (int j = 0; j < DivideCount; j++) { //Loop for L, R: prepare 65536 words + 8192 words to process. for (int k = 0; k < 2; k++) { unsigned char LData1, LData2; short LData; for (int m = 0; m < InputDataLen; m++) { if (FIR[j].chPointer[k] >= 0) { //regular case. copy actual data. LData1 = (unsigned char) (* (p+FIR[j].chPointer[k]*4+k*2 ) ); LData2 = (unsigned char) (* (p+FIR[j].chPointer[k]*4+k*2 + 1) ); LData = (short)(LData1 + (LData2<<8)); h_idata[m] = ((float)(LData))/32768.0; FIR[j].chPointer[k] += 1; } else { //beggining, copy 0 for delay padding. h_idata[m] = 0.0; FIR[j].chPointer[k] += 1; } } //TAP area was copied to buffer, by m. get step back data pointer. FIR[j].chPointer[k] -= DEFTAPS; //Copy Wave Data to Device // copy host memory to device CUDA_SAFE_CALL( cudaMemcpy(d_idata, h_idata,sizeof(float)*InputDataLen,cudaMemcpyHostToDevice)); //copy coeffs to constant CUDA_SAFE_CALL( cudaMemcpyToSymbol(coeff_Kernel, FIR[j].FIRCoeff, sizeof(float)*DEFTAPS) ); //Call Kernel CUDA_SAFE_CALL( cudaThreadSynchronize() ); calcFIR<<>>(d_idata, d_odata, PacketLength); CUT_CHECK_ERROR("calcFIR failed"); CUDA_SAFE_CALL( cudaThreadSynchronize() ); //Get processed data to Packet Output Buffer CUDA_SAFE_CALL( cudaMemcpy( (void *)(h_odata[j*2 + k]), d_odata, sizeof(float)*PacketLength, cudaMemcpyDeviceToHost) ); }//L, R Loop End }//0,1,2,3 Loop End printf("Calculating %4d / %4d, GPUCounter = %d , pCounter = %d \r", i+1, LoopCount, data->GPUCounter ,data->pCounter); //8xPacket buffer ready //Copy to Wave Format Buffer double LValue; double RValue; // convert back to 16bit short for ( int j = 0; j < DivideCount; j++) { for (int i = 0; i < PacketLength; i++) { LValue = (h_odata[j*2 ][i]); RValue = (h_odata[j*2+1][i]); LValue *= FIR[j].dOffset; RValue *= FIR[j].dOffset; //Moved - acceralated Sigmoid Function if (LValue > 0.98) { LValue = 0.98+((1.0/(1.0 + exp(-20.0*(LValue-0.98))) - 0.5)*0.1); } if (LValue < -0.98) { LValue = -0.98- (0.5 - (1.0/(1.0 + exp(-20.0*(LValue+0.98))))) * 0.1; } if (RValue > 0.98) { RValue = 0.98+((1.0/(1.0 + exp(-20.0*(RValue-0.98))) - 0.5)*0.1); } if (RValue < -0.98) { RValue = -0.98- (0.5 - (1.0/(1.0 + exp(-20.0*(RValue+0.98))))) * 0.1; } sWave_Out_Data[i*DivideCount*2 + j*2 ] = (short)(LValue*32768.0); sWave_Out_Data[i*DivideCount*2 + j*2 + 1] = (short)(RValue*32768.0); } } if (FileOut) { //write packet data to file fwrite(sWave_Out_Data, sizeof(short), PacketLength*DivideCount*2, fo); //8192 * 8 channels * 2 byte per channel } if (StreamOut) { //write packet data to buffer //sWave = Short! x2 to get byte size memcpy(data->po + data->GPUCounter, sWave_Out_Data, PacketLength*DivideCount*2*2); data->GPUCounter += PacketLength*DivideCount*2*2; } //End Loop for Packet } printf("\ndone FIR processing.\n"); //Free Memory fclose(f); if (FileOut) fclose(fo); free(p); cudaFreeHost(h_idata); cudaFree(d_idata); cudaFree(d_odata); for (int i = 0; i < 8; i++) { cudaFreeHost(h_odata[i]); } cudaFreeHost(sWave_Out_Data); cudaFreeHost(wWave_Out_Data); return 0; }