# Utilities for reading column-oriented files # # H. Ferguson - revised 10/20/03 to allow use of Numeric or numarray # H. Ferguson - revised 2/10/08 to use numpy # H. Ferguson - revised to add some options and remove Numeric and numarray """Routines for reading general whitespace-delimited, column-oriented files. Returned values are either numpy one-dimensional arrays. The read routines parse the input looking for decimal points or non-numeric characters to decide on the format of the output. Reading is therefore a bit slow, but the program interface is extremely simple. For example, if the file 'foo' has three columns, read them in as follows: a,b,c = fgetcols('foo') A few other options: a,b,c,d = fgetcols('foo',1,3,5,7) # Read some selected columns a = fgetcols('foo') # Read all the columns (a is then a tuple of arrays) a,b,c = fgetcols('foo',fs=',') # Change the field separator to a comma a,b,c = fgetcols('foo',cmt='!') # Change the comment character to '!' The module also provides an object-oriented interface to save re-reading the file if multiple getcol calls are desired: f = readcol('foo') a,b = f.getcols(1,2) c,d = f.getcols(3,4) f.close() Ignores comment lines. Ignores blank lines. Optionally changes INDEF to a desired value (e.g. -99.99). As of version 5.0, only numpy is offered (Numeric and numarray used to be options). """ __version__ = '5.0' # Numpy is now the default __author = 'Henry C. Ferguson, STScI' import string import numpy def remove_comments(l,cmt='#'): comments = [] for i in range(len(l)): l[i] = l[i].strip() if l[i] == '\n' or l[i][0] == cmt: comments = comments + [i] ngone = 0 for i in comments: comment = l.pop(i-ngone) ngone = ngone+1 return l def replace_indef(l,indef): for i in range(len(l)): while string.find(l[i],"INDEF") > -1: idx = string.find(l[i],"INDEF") l[i] = l[i][:idx]+indef+l[i][idx+5:] return l class readcol: """Column-oriented file methods.""" def __init__(self,cfile,arraytype=numpy,indef=""): """Open file, read in all the lines, and return numpy arrays. Arguments: cfile -- file to read arraytype -- numpy (used to allow Numeric or numarray) indef -- string replacement for INDEF (e.g. NaN) """ f = open(cfile,'r') self.l = f.readlines() self.l = remove_comments(self.l) if indef: self.l = replace_indef(self.l,indef) f.close() self.N = arraytype def getcol(self,col,fs=None): """Read in a single column (columns start at 1).""" return getcol(col,self.l,self.N,fs=fs) def getcols(self,*args,**kwargs): """Read in a multiple columns (columns start at 1).""" if 'fs' in keywords.keys(): fs = keywords['fs'] else: fs = None ret = [] for i in range(len(args)): ret = ret + [getcol(args[i],self.l,self.N,fs=fs)] return ret def close(self): """Release the memory associated with the lines read by __init__""" del(self.l) def getcol(col,lines,N,fs=None): """Read in a single column from a list of strings. Parse each column to determine the type of variable (integer, float, string) and return either an array of that type (int64, float64) or a character array. Arguments: col -- desired column (starting at 1) lines -- list of strings (one per line) read from input file N -- numpy """ i = col-1 nlines = len(lines) if fs != None: # If delimiter is not whitespace, remove the whitespace oldlines = lines lines = [] for ol in oldlines: lines += [string.join(ol.split())] a = lines[0].split(fs) # Determine the type from the first line if string.find(a[i],'.') < 0: try: x = int(a[i]) except: values = range(nlines) getstrings(col,lines,values,fs=fs) values = N.array(values) else: values = N.zeros((nlines),N.int64) if type(getints(col,lines,values,fs=fs)) == type(1): values = N.zeros((nlines),N.float64) getfloats(col,lines,values,fs=fs) else: try: x = float(a[i]) except: values = range(nlines) getstrings(col,lines,values,fs=fs) values = N.array(values) else: values = N.zeros((nlines),N.float64) getfloats(col,lines,values,fs=fs) return values def getstrings(col,lines,values,fs=None): n = 0 for l in lines: a = l.split(fs) values[n] = a[col-1] n = n+1 def getints(col,lines,values,fs=None): n = 0 for l in lines: a = l.split(fs) if string.find(a[col-1],'.') > 0: return -1 else: values[n] = int(a[col-1]) n = n+1 return values def getfloats(col,lines,values,fs=None): n = 0 for l in lines: a = l.split(fs) values[n] = float(a[col-1]) n = n+1 def fgetcol(cfile,col,arraytype="numpy",cmt='#',indef="-99.99"): """Read in a single column from a file. Parse the column to determine the type of variable (integer, float, string) and return either an array of that type (int64, float64) or a character array. Arguments: cfile -- file to be read col -- desired column (starting at 1) arraytype -- numpy indef="-99.99" (INDEF replacement string) """ f = open(cfile,'r') l = f.readlines() f.close() l = remove_comments(l,cmt=cmt) if indef: l = replace_indef(l,indef) if arraytype == "numpy": N = numpy return getcol(col,l,N) def fgetcols(cfile,*args,**keywords): """Read multiple columns from a file. Parse each column to determine the type of variable (integer, float, string) and return either one-dimensional arrays of the appropriate type (int64, float64) or a character array. Arguments: cfile -- file to be read *args -- desired columns (starting at 1) **keywords -- indef="-99.99" (INDEF replacement string) -- cmt="#" (comment character) -- fs=None (field separator; defaults to whitespace) Examples: If the file 'foo' has three columns, read them in as follows: a,b,c = fgetcols('foo') A few other examples: a,b,c,d = fgetcols('foo',1,3,5,7) # read selected columns a = fgetcols('foo') # read all columns a,b,c = fgetcols('foo',fs=',') # Change the field separator a,b,c = fgetcols('foo',cmt='!') # Change the comment character to '!' """ f = open(cfile,'r') l = f.readlines() f.close() if 'cmt' in keywords.keys(): cmt = keywords['cmt'] else: cmt = '#' l = remove_comments(l,cmt=cmt) if 'indef' in keywords.keys(): indef = keywords['indef'] l = replace_indef(l,indef) N = numpy if 'arraytype' in keywords.keys(): arraytype = keywords['arraytype'] if arraytype != "numpy": print "readcol: As of v5.0, only numpy arrays are returned" if 'fs' in keywords.keys(): fs = keywords['fs'] else: fs = None ret = [] ncols = len(args) colnumbers = args if ncols == 0: # If no columns are listed, read them all ncols = len(l[0].split(fs)) colnumbers = N.array(range(ncols))+1 for i in range(ncols): ret = ret + [getcol(colnumbers[i],l,N,fs=fs)] return ret