#!/usr/bin/python

# Script written by Fred Hoyt, 2007-2012

############
# Overview #
############
#
# This script is for use with the script 
# get_ldc_sounds.praat for Praat
#  
# http://www.fon.hum.uva.nl/praat/
#
# It also calls the 'sph2pipe' utility for converting
# audio files in NIST .sph format to .wav format: 
#
# https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/ctools/sph2pipe_v2.5.tar.gz
# 
# The Praat script and sph2pipe should be in the same 
# directory as the text files being processed. 
#
######################
# Input Data Formats #
######################
#
# The script is written to be used with STDIN from 
# a list of lines from Linguistic Data Consortium (LDC) 
# audio corpus transcript data with a file name prefixed to each line. 
#
# For example:
#
# fla_0001.A:AMR.B:BRT.txt:001.00 002.00 A(AMR:F:mid): my foot <noise> hurts 
#
# "fla_0001.A:AMR.B:BRT.txt" is a file name. This particular file name is
# the LDC file name suffixed with metadata about the speakers in the file
# (see below for a linux shell script for doing this):  
#
#"Speaker A" is American (AMR) and "Speaker B" is British (BRT). 
# The "001.00 002.00" is the time stamp of the line in the audio file 
# and the "A(AMR:F:mid)" further specifies that Speaker A is female
# and middle-aged. 
# 
# Including metadata in the file name is not necessary
# for the working of the script, although it is 
# useful for keeping track of the files that the script
# produces. What the script does require is that the names
# of transcript files included in the input data lists
# be identical to or at least include the file names 
# of the audio files that they correspond to. For example,
# given an audio file abc_001.wav, the corresponding
# transcript file should be named either abc_001.txt 
# or abc_001:OtherStuff.txt. 
#
################
# How it works #
################
#
# The way the script works is that each line of STDIN
# is read, the transcript file name is extracted 
# and the corresponding audio file
# is found at the end of the path specified by arg1. 
# The script then calls a Praat script which excises 
# the segment of the audio corresponding to the time stamp
# and does some Praat stuff to it. 
#
# For example, given a list of transcript file lines 'foo.txt'
# containing the following lines:
#
# fla_0001.A:AMR.B:BRT.txt:001.00 002.00 A(AMR:F:mid): my foot <noise> hurts
# fla_0002.A:AMR.B:BRT.txt:001.00 002.00 B(AMR:F:mid): oh that sucks
#
# and a directory bar/ of audio files (in LDC .sph format) containing
# the following files corresponding to the transcript files 
# referenced in 'foo.txt': 
#
# fla_0001.A:AMR.B:BRT.sph
# fla_0002.A:AMR.B:BRT.sph
#
# Assuming that foo.txt and bar/ are located 
# in the same directory as the necessary script files, 
# the script is invoked as follows (preceded by an 'ls' command
# to verify that the script files are present in
# the local directory): 
#
# $ ls ./
# bar foo.txt get_ldc_data.py get_ldc_sounds.praat sph2pipe
# $ cat foo.txt | get_ldc_data.py bar/ sph wav
#
# This produces a Praat "collection" corresponding 
# to each transcript-audio file pair and containing
# a Praat textgrid aligned with "sound" (audio segment)
# in .wav format:
#
# fla_0001.A:AMR.B:BRT.Collection
# fla_0002.A:AMR.B:BRT.Collection
#
# Note that, according to the scripts as written, 
# the script files should be in the same directory
# as the transcript files

##########
# Script #
##########

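# take stdin as input (in other words, pipe from cat) and 3 arguments
# arg1 = absolute path to audio data (prepended directly to each audio file name)
# arg2 = extension of the input audio files (e.g. sph)
# arg3 = extension of the audio files handed to the Praat script (e.g. wav)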

import os
import re
import shutil
import subprocess
import sys

# Command-line arguments (all three are required):
#   arg1 (inpath)   - location of the audio files.  It is prepended verbatim
#                     to each audio file's base name, so a directory has to
#                     be given WITH its trailing separator,
#                     e.g. /home/corpora/corpus/audio/
#   arg2 (audioin)  - extension of the source audio files, e.g. sph or wav
#   arg3 (audioout) - extension of the audio files handed to Praat, e.g. wav
if len(sys.argv) < 4:
    # Fail with a usage hint instead of a bare IndexError traceback.
    sys.stderr.write(
        'usage: cat LINES | %s INPATH AUDIOIN AUDIOOUT\n' % sys.argv[0])
    sys.exit(2)
inpath, audioin, audioout = sys.argv[1:4]

# read in the first line of STDIN (pipe from cat); the main loop below
# fetches the remaining lines one at a time
# e.g. $ cat foo.txt | python get_corpus_data.py bar/ wav wav 
line = sys.stdin.readline()

# One regex describes a complete input line.  It is assembled below from
# adjacent string literals, one per field, which Python joins into a
# single pattern.  Schematically a line looks like:
#
#   file_NNNN[.extra].txt:NNN.NN NNN.NN X(NAT:S:AGE): Data
#
# and the capture groups are:
#
#   1 - base file name (letters, "_", 1-4 digits), e.g. "fla_0001"
#   2 - start of the time stamp
#   3 - end of the time stamp
#   4 - speaker tag ("A" or "B")
#   5 - speaker metadata, e.g. "AMR:F:mid"
#   6 - the data line itself
#
# See README file for more details. 

file_match = re.compile(
    # group 1: base file name
    r'([a-zA-Z]{1,}\_[0-9]{1,4})'
    # optional extra name material, then the ".txt:" separator (not captured)
    r'[\.\:a-zA-Z0-9]*\.txt\:'
    # groups 2 and 3: start and end of the time stamp
    r'([0-9]{1,4}\.[0-9]{1,2})'
    r' '
    r'([0-9]{1,4}\.[0-9]{1,2})'
    r' '
    # group 4: speaker tag
    r'([AB])'
    # group 5: speaker metadata inside literal parentheses
    r'\(([a-zA-Z]{2,4}\:[FMU]\:[a-zA-Z0-9]{1,3})\)'
    r'\: '
    # group 6: everything up to the end of the line
    r'(.*)$'
)

def _run(argv):
    """Run an external program from an argument list, without a shell.

    Handing over a list keeps paths and transcript text from being
    re-parsed by the shell, so quotes, "$" and backticks in the input
    reach the program literally.  A program that cannot be started is
    reported on stderr and processing continues with the next step.

    Returns the program's exit status, or None if it could not be started.
    """
    try:
        return subprocess.call(argv)
    except OSError as err:
        sys.stderr.write('Could not run %s: %s\n' % (argv[0], err))
        return None


# for each line
while line:
    # strip out white space
    line = line.strip()
    # match once and reuse the result
    match = file_match.match(line)
    if match:
        # cut the line up according to the six groups of the regex:
        # base file name, start and end of the time stamp, speaker tag,
        # speaker metadata and the actual datum
        base, start, end, speaker, info, datum_string = match.groups()
        # replace "\" with the null string
        datum_string = datum_string.replace('\\', '')
        # print a name built from the groups to indicate progress
        filename = '%s_%s-%s_%s.txt' % (base, start, end, info)
        print(filename)
        # fetch the audio into the working directory in the format that
        # is handed to Praat
        source = '%s%s.%s' % (inpath, base, audioin)
        target = '%s.%s' % (base, audioout)
        if audioin == 'sph':
            # convert LDC audio from sph format for the sake of Praat
            _run(['./sph2pipe', '-f', 'rif', source, target])
        elif audioin == audioout:
            # already in the requested format: a plain copy is enough
            _run(['cp', source, target])
        else:
            # any other pair of formats is converted with sox
            _run(['./sox', source, target])
        # call Praat script; the datum travels as one literal argument
        _run(['/usr/bin/praat', 'get_corpus_sounds.praat',
              audioout, base, start, end, speaker, info, datum_string])
        # Remove audio files to save space
        # NOTE(review): this deletes every file whose name ends in "wav"
        # in the working directory, whatever audioout is -- confirm that
        # this is intended.
        os.system('rm *wav')
    else:
        print("No match found.")

    # and so on...
    line = sys.stdin.readline()
