# Utilities for parseing SExtractor files
#
# H. Ferguson - revised 4/23/03 to promote ints to floats if a value
# with a decimal point appears somewhere in the column originally thought
# to be integers
# 
# v2.1 - fails gracefully when the catalog has no sources
# v3.0 - added gettypes to return column types
#      - create new column names when they are not explictly in the header
# v4.0 - added gettypes to return column types
# v4.1 - uses numarray by default
# v4.2 - delete attributed 'l' (input lines from catalog) before returning
# v4.3 - 1/11/06Added less-offensive alias se_catalog() == sextractor()
# v4.4h- 1/21/06 Fixed bug in creating extra column names when last is a vector
# v4.4 - V. Laidler added new methods:
#           __len__ returns number of objects in catalog
#           __iter__ returns the index of the next row in the catalog
#           line(self,i) returns a constructed string containing the ith line 
#           buildheader returns a constructed header from the hdict
#        Added new attribute self.header: contains the header as read in
#           from the catalog.
#        Lines that start with '#' but are not followed by an integer are
#           now assumed to be comment lines, which are added to the
#           header but otherwise skipped.
# v4.5 - V. Laidler removed Numeric dependence
# v4.6 - V. Laidler converted to numpy
# v5.0 - 7/5/07 Numpy conversion
# v6.0 - V. Laidler: added rw_catalog class, reworked internals to avoid
#           column name clashes

__version__ = '6.0'
__author = 'Henry C. Ferguson, STScI'

import string
import numpy as N
import os, sys

class se_catalog(object):
    """ Read a SExtractor-style catalog. 
        Usage: c=se_catalog(catalog,readfile=True,preserve_case=False)
        Will read the catalog and return an object c, whose attributes are 
        arrays containing the data. For example, c.mag_auto contains the 
        mag_auto values.
        Arguments:
        catalog -- The input SExtractor catalog. 
        readfile -- True means read the data. False means return the 
           object without reading the data. The lines from the catalog
           are returned as a list of ascii strings c.l. Useful if you want
           to do some special parsing of some sort. 
        preserve_case -- default (False) converts column names to lower case

        The input catalog MUST have a header with the SExtractor format:
           # 1 ID comment
           # 2 ALPHA_J200 another comment
        That is, first column is the comment symbol #, second column is
        the column number, third column is the column name, and the rest
        of the line is a comment. SExtractor allows "vectors" to be identified
        only by the first column...e.g.
           # 12 FLUX_APER
           # 20 FLUXERR_APER 
        the missing columns are all aperture fluxes through different
        apertures. These will be read into attributes:
           c.flux_aper   # The first one
           c.flux_aper_1 # the second one, and so on

        The case of aperture radii is a bit nasty, since these only
        appear in the SExtractor configuration file. Use parseconfig()
        to read that file.
    """
    def __init__(self,cfile,readfile=True,preserve_case=False):
        (self._d,self._l,self._ncolumns,self._header) = initcat(cfile,
                                                preserve_case=preserve_case)
        self._fname=cfile
        if readfile:
            self._colentries = range(len(self._l))
            for i in range(len(self._l)):
                self._colentries[i] = self._l[i].split()
            self.gettypes()
            for k in self._d.keys():
                contents = getcolvalues(self._d[k],self._type[k],
                                        self._colentries)
                try:
                    #Munge column name if it conflicts
                    test=self.__getattribute__(k)
                    newkey='c_'+k
                    print "--Column '%s' read in as '%s' to avoid conflicts"%(k,newkey)
                    setattr(self,newkey,contents)
                    self._d[newkey]=self._d[k]
                    del self._d[k]
                except AttributeError:
                    setattr(self,k,contents)
            delattr(self,'_l')

    def __len__(self):
        return len(self._colentries)

    def __iter__(self):
        return range(len(self._colentries)).__iter__()

    def line(self,i):
        """ Returns an assembled line of this catalog suitable for writing.
        Except it doesn't really, if we modified the individual columns..."""
        ans = '    '.join(self._colentries[i])+'\n'
        return ans

    def buildheader(self):
        """ Reconstruct the header from the header dictionary.
        This might be useful if only a few columns were selected
        from the file; otherwise just use the 'header' attribute. """
        
        lines = {}
        for k in self._d:
            lines[self._d[k]]='#   %d   %s'%(self._d[k],k.upper())
        #sort the new keys
        nkeys= lines.keys()
        nkeys.sort()
        #join them together with newlines
        ans = ''
        for k in nkeys:
            ans=ans+"%s\n"%lines[k]
        return ans
        

    def getcol(self,col,offset=0):
        column = self._d[col]
        return getcol(column+offset,self._l)
    def getcols(self,*args):
        ret = []
        for i in range(len(args)):
           ret = ret + [getcol(self._d[args[i]],self._l)]
        return ret
    def gettypes(self):
        self._type = {}
        for k in self._d.keys():
           ret = getcol(self._d[k],self._l[:100])
           t = type(ret)
           if t == type(N.array([1])):
               if ret.dtype.char == 'i' or ret.dtype.char == 'l':
                  t = type(1)
               if ret.dtype.char == 'd':
                  t = type(1.e99)
           self._type[k] = t
#          print k, t


class sextractor(se_catalog): # Just an alias for class se_catalog
    """ Read SExtractor catalog...just an alias for se_catalog """
    pass

class rw_catalog(se_catalog):
    """ Extend the se_catalog class to support adding new columns,
    and writing out the new version."""

    def __init__(self, fname):
        self._modflag=False #this flag will be set by add_column routines
        self._fname = fname
        self._colnames=[]
        se_catalog.__init__(self,fname,
                                      readfile=True,preserve_case=False)
        coldict = invert_dict(self._d)
        for k in coldict:
            self._colnames.append(coldict[k])
            
    def addcolumn(self, colname, coldata):
        """ coldata must be a 1d numarray of the correct length"""
        if len(coldata) != len(self):
            raise ValueError,"Column length must match catalog length"

        #Most of the bookkeeping is the same as for an empty column
        self.addemptycolumn(colname,coldata.dtype)

        #and then we reset the column to contain the actual data
        setattr(self,colname,coldata)


    def addemptycolumn(self, colname, coltype):
        """ Defines a new column & updates all the bookkeeping, but
        does not actually fill in the data. """
        setattr(self,colname,N.zeros((len(self),),coltype))
        self._modflag=True
        self._type[colname]=coltype

        #Looks strange here because we count columns from 1 but
        #Python counts them from 0
        self._ncolumns+=1
        self._d[colname]=self._ncolumns
        self._colnames.append(colname)
        self._header+='#  %d  %s\n'%(self._ncolumns,colname)

    def line(self,rownum):
        """ Construct a new line as to be printed out """
        if not self._modflag:
            return se_catalog.line(self,rownum)
        else:
            linelist=[]
            for c in self._colnames:
                col=getattr(self,c)
                linelist.append(str(col[rownum]))
            line='    '.join(linelist)+'\n'
            return line

    def writeto(self,outname,clobber=False):
        if not clobber:
            if os.path.isfile(outname):
                raise ValueError, """File already exists.
                   Use .writeto(fname, clobber=True) to overwrite. """

        out=open(outname,'w')

        out.write(self._header)
        for k in range(len(self)):
            out.write(self.line(k))
        out.close()

    def printme(self):
        """ Like writeto, but for sys.stdout """
        sys.stdout.write(self._header)
        for k in range(len(self)):
            sys.stdout.write(self.line(k))

def invert_dict(d):
    """ Generate a new dictionary with the key/value relationship inverted """
    newd={}
    for k in d:
        newd[d[k]]=k
    return newd

def parseconfig_se(cfile):
  """ parseconfig -- read a SExtractor .sex file and return a dictionary
    of options & values. Comments are ignored.
  """
  cdict = {}
  f = open(cfile,'r')
  lines = f.readlines()
  for l in lines:
    a = string.split(l)
    if len(a) > 0:
      if a[0][0] != '#':
        maxi = len(a)
        for i in range(1,len(a)):
          if a[i][0] == '#':
            maxi = i
            break
        # Turn comma-separated lists into python lists
        entry = []
        for e in a[1:maxi]:
          if string.find(e,','):
            entry = entry + string.split(e,',')
          else:
            entry = entry + [e]
        cdict[a[0]] = entry
  return cdict

def initcat(catfile,preserve_case=False):
  """ parseheader -- reads the header of a SExtractor catalog file and 
      returns a dictionary of parameter names and column numbers.
      Also returns a list of lines containing the data.
  """
  hdict = {}
  header=[]
  f = open(catfile,'r')
  lines = f.readlines()
  f.close()
  first = 1
  firstdata = 0
  i = 0
  previous_column=0
  previous_key=""
  for l in lines:
    if l.startswith('#'): #this is a header line        
        header.append(l)        
        a = (l.replace('#','# ')).split() #Guard against "#10 colname"
        try:
            col = int(a[1])
            # If the column numbers skip, create new column names for
            # columns not named explicitly in the header
            if col != previous_column+1:
              for c in range(previous_column+1,col):
                column_name = previous_key+"_%d" % (c-previous_column)
                hdict[column_name] = c
            # Update this column in the dictionary
            if (preserve_case):
                column_name = a[2]
            else:
                column_name = a[2].lower()
                hdict[column_name] = col
            firstdata = i+1
            previous_column = col
            previous_key = column_name
        except (ValueError, IndexError):
            #it's a comment line with no column number,
            #or an entirely blank comment line: skip
            pass
    
    else:  # This is where the data start
        a=string.split(l)
        if len(a)>0:
            if first:
              firstdata = i
              first = 0
            # Check if there are extra columns
            if len(a) > previous_column: 
            # If so, add keys for the last entry
              for c in range(previous_column+1,len(a)):
                column_name = previous_key+"_%d" % (c-previous_column)
                if (preserve_case):
                    hdict[column_name] = c
                else:
                    hdict[column_name] = c.lower()
            ncolumns = len(a)
    i = i+1
  return(hdict,lines[firstdata:],ncolumns,''.join(header))

def getcol(col,lines):
  """ Get a column from a SExtractor catalog. Determine the type
      (integer, float, string) and return either an array of that
      type (Int32, Float64) or a list of strings """
  i = col-1
  nlines = len(lines)
  if len(lines) == 0:
    values = N.array([])
    return values
  a = string.split(lines[0])
  if string.find(a[i],'.') < 0:
    try:
      x = int(a[i]) 
    except:
      values = range(nlines)
      getstrings(col,lines,values)
    else:
      values = N.zeros((nlines),N.int32)
      if type(getints(col,lines,values)) == type(-1):
        values = N.zeros((nlines),N.float64)
        getfloats(col,lines,values)
  else:
    try:
      x = float(a[i]) 
    except:
      values = range(nlines)
      getstrings(col,lines,values)
    else:
      values = N.zeros((nlines),N.float64)
      getfloats(col,lines,values)
  return values

def getcolvalues(col,coltype,colentries):
  """ Get a column from a SExtractor catalog. Determine the type
      (integer, float, string) and return either an array of that
      type (Int32, Float64) or a list of strings """
  i = col-1               # Columns start at 1, arrays start at 0
  nlines = len(colentries)
  if len(colentries) == 0:
    values = N.array([])
    return values
  a = colentries	
  
  if coltype == type([]):	  # Convert strings
      values = range(nlines)
      for j in range(nlines):
          values[j] = colentries[j][i]

  if coltype == type(1.0):	# Convert floats 
      values = N.zeros((nlines),N.float64)
      for j in range(nlines):
          values[j] = float(colentries[j][i])

  if coltype == type(1):	# Convert Ints
      values = N.zeros((nlines),N.int32)
      for j in range(nlines):
          values[j] = int(colentries[j][i])
  return values

def getstrings(col,lines,values):
  n = 0
  for l in lines:
    a = string.split(l)
    values[n] = a[col-1]
    n = n+1

def getints(col,lines,values):
  n = 0
  for l in lines:
    a = string.split(l)
    if string.find(a[col-1],'.') > 0:
      return -1
    else:
      values[n] = int(a[col-1])
    n = n+1
  return values    


def getfloats(col,lines,values):
  n = 0
  for l in lines:
    a = string.split(l)
    values[n] = float(a[col-1])
    n = n+1

def getcols(d,l,*args):
   """ Get multiple columns from SExtractor list using getcol() """
   ret = []
   for i in range(len(args)):
      ret = ret + [getcol(d[args[i]],l)]
   return ret

def writeheader(fh,colnames):
    """ Write an SExtractor-style header to an open file handle.

    @param fh: file handle
    @type fh: file

    @param colnames: list of column names
    @type colnames: list

    @todo: add space checking to colnames
    @todo: permit passing a filename?
    @todo: handle comments
    """
    for i in range(len(colnames)):
        fh.write('#   %d   %s\n'%(i+1,colnames[i]))