Source code for rios.calcstats

"""
This module creates pyramid layers and calculates statistics for image
files. Much of it was originally for ERDAS Imagine files but should work 
with any other format that supports pyramid layers and statistics

"""
# This file is part of RIOS - Raster I/O Simplification
# Copyright (C) 2012  Sam Gillingham, Neil Flood
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import os
import warnings
import numpy
from osgeo import gdal
from . import cuiprogress
from .rioserrors import ProcessCancelledError

# When calculating overviews (i.e. pyramid layers), default behaviour
# is controlled by these
dfltOverviewLvls = os.getenv('RIOS_DFLT_OVERVIEWLEVELS')
if dfltOverviewLvls is None:
    DEFAULT_OVERVIEWLEVELS = [4, 8, 16, 32, 64, 128, 256, 512]
else:
    DEFAULT_OVERVIEWLEVELS = [int(i) for i in dfltOverviewLvls.split(',')]
DEFAULT_MINOVERVIEWDIM = int(os.getenv('RIOS_DFLT_MINOVERLEVELDIM', default=33))
DEFAULT_OVERVIEWAGGREGRATIONTYPE = os.getenv('RIOS_DFLT_OVERVIEWAGGTYPE', 
    default="AVERAGE")


[docs]def progressFunc(value, string, userdata):
    """
    Progress callback for BuildOverviews
    """
    percent = (userdata.curroffset + (value / userdata.nbands) * 100)
    userdata.progress.setProgress(percent)
    if value == 1.0:
        userdata.curroffset = userdata.curroffset + 100.0 / userdata.nbands
    return not userdata.progress.wasCancelled()


# make userdata object with progress and num bands
[docs]class ProgressUserData(object):
    pass


[docs]def addPyramid(ds, progress, 
        minoverviewdim=DEFAULT_MINOVERVIEWDIM, 
        levels=DEFAULT_OVERVIEWLEVELS,
        aggregationType=None):
    """
    Adds Pyramid layers to the dataset. Adds levels until
    the raster dimension of the overview layer is < minoverviewdim,
    up to a maximum level controlled by the levels parameter. 
    
    Uses gdal.Dataset.BuildOverviews() to do the work. 
    
    """
    progress.setLabelText("Computing Pyramid Layers...")
    progress.setProgress(0)
    
    # ensure everything is written to disc first
    ds.FlushCache()

    # first we work out how many overviews to build based on the size
    if ds.RasterXSize < ds.RasterYSize:
        mindim = ds.RasterXSize
    else:
        mindim = ds.RasterYSize
    
    nOverviews = 0
    for i in levels:
        if (mindim // i) > minoverviewdim:
            nOverviews = nOverviews + 1

    # Need to find out if we are thematic or continuous. 
    tmpmeta = ds.GetRasterBand(1).GetMetadata()
    if aggregationType is None:
        if 'LAYER_TYPE' in tmpmeta:
            if tmpmeta['LAYER_TYPE'] == 'athematic':
                aggregationType = "AVERAGE"
            else:
                aggregationType = "NEAREST"
        else:
            aggregationType = DEFAULT_OVERVIEWAGGREGRATIONTYPE
    
    userdata = ProgressUserData()
    userdata.progress = progress
    userdata.nbands = ds.RasterCount
    userdata.curroffset = 0

    ds.BuildOverviews(aggregationType, levels[:nOverviews], progressFunc, userdata)
  
    if progress.wasCancelled():
        raise ProcessCancelledError()

    # make sure it goes to 100%
    progress.setProgress(100)


[docs]def findOrCreateColumn(ratObj, usage, name, dtype):
    """
    Returns the index of an existing column matched
    on usage. Creates it if not already existing using 
    the supplied name and dtype
    Returns a tupe with index and a boolean specifying if 
    it is a new column or not
    """
    ncols = ratObj.GetColumnCount()
    for col in range(ncols):
        if ratObj.GetUsageOfCol(col) == usage:
            return col, False

    # got here so can't exist
    ratObj.CreateColumn(name, dtype, usage)
    # new one will be last col
    return ncols, True


gdalLargeIntTypes = set([gdal.GDT_Int16, gdal.GDT_UInt16, gdal.GDT_Int32, gdal.GDT_UInt32])
# hack for GDAL 3.5 and later which suppport 64 bit ints
if hasattr(gdal, 'GDT_Int64'):
    gdalLargeIntTypes.add(gdal.GDT_Int64)
    gdalLargeIntTypes.add(gdal.GDT_UInt64)

gdalFloatTypes = set([gdal.GDT_Float32, gdal.GDT_Float64])


[docs]def addStatistics(ds, progress, ignore=None, approx_ok=False):
    """
    Calculates statistics and adds them to the image
    
    Uses gdal.Band.ComputeStatistics() for mean, stddev, min and max,
    and gdal.Band.GetHistogram() to do histogram calculation. 
    The median and mode are estimated using the histogram, and so 
    for larger datatypes, they will be approximate only. 
    
    For thematic layers, the histogram is calculated with as many bins 
    as required, for athematic integer and float types, a maximum
    of 256 bins is used. 
    
    """
    progress.setLabelText("Computing Statistics...")
    progress.setProgress(0)
    percent = 0
    percentstep = 100.0 / (ds.RasterCount * 2)  # 2 steps for each layer

    # flush the cache. The ensures that any unwritten data is 
    # written to file so we get the right stats. It also 
    # makes sure any metdata is written on HFA. This means
    # the LAYER_TYPE setting will be picked up by rat.SetLinearBinning()
    ds.FlushCache()

    # The GDAL HFA driver has a bug in its SetLinearBinning function,
    # which was introduced as part of the RFC40 changes. Until
    # this is fixed and widely distributed, we should disable the use
    # of RFC40-style techniques for HFA files.
    driverName = ds.GetDriver().ShortName
    disableRFC40 = (driverName == 'HFA')
  
    for bandnum in range(ds.RasterCount):
        band = ds.GetRasterBand(bandnum + 1)

        # fill in the metadata
        tmpmeta = band.GetMetadata()
    
        if ignore is not None:
            # tell QGIS that the ignore value was ignored
            band.SetNoDataValue(ignore)
            tmpmeta["STATISTICS_EXCLUDEDVALUES"] = repr(ignore)  # doesn't seem to do anything
      
        # get GDAL to calculate statistics - force recalculation. Trap errors 
        usingExceptions = gdal.GetUseExceptions()
        gdal.UseExceptions()
        try:
            if approx_ok and "LAYER_TYPE" in tmpmeta and tmpmeta["LAYER_TYPE"] == "thematic": 
                warnings.warn('WARNING: approx_ok specified for stats but image is thematic (this could be a bad idea)')

            (minval, maxval, meanval, stddevval) = band.ComputeStatistics(approx_ok)
        except RuntimeError as e:
            if str(e).endswith('Failed to compute statistics, no valid pixels found in sampling.'):
                minval = ignore
                maxval = ignore
                meanval = ignore
                stddevval = 0
            else:
                raise e
        finally:
            if not usingExceptions:
                gdal.DontUseExceptions()

        percent = percent + percentstep
        progress.setProgress(percent)
    
        tmpmeta["STATISTICS_MINIMUM"] = repr(minval)
        tmpmeta["STATISTICS_MAXIMUM"] = repr(maxval)
        tmpmeta["STATISTICS_MEAN"] = repr(meanval)
        tmpmeta["STATISTICS_STDDEV"] = repr(stddevval)
        # because we did at full res - these are the default anyway

        if approx_ok:
            tmpmeta["STATISTICS_APPROXIMATE"] = "YES"
        else:
            tmpmeta["STATISTICS_SKIPFACTORX"] = "1"
            tmpmeta["STATISTICS_SKIPFACTORY"] = "1"

        # create a histogram so we can do the mode and median
        if band.DataType == gdal.GDT_Byte:
            # if byte data use 256 bins and the whole range
            histmin = 0
            histmax = 255
            histstep = 1.0
            histCalcMin = -0.5
            histCalcMax = 255.5
            histnbins = 256
            tmpmeta["STATISTICS_HISTOBINFUNCTION"] = 'direct'
        elif "LAYER_TYPE" in tmpmeta and tmpmeta["LAYER_TYPE"] == 'thematic':
            # all other thematic types a bin per value
            histmin = 0
            histmax = int(numpy.ceil(maxval))
            histstep = 1.0
            histCalcMin = -0.5
            histCalcMax = maxval + 0.5
            histnbins = histmax + 1
            tmpmeta["STATISTICS_HISTOBINFUNCTION"] = 'direct'
        elif band.DataType in gdalLargeIntTypes:
            histrange = int(numpy.ceil(maxval) - numpy.floor(minval)) + 1
            (histmin, histmax) = (minval, maxval)
            if histrange <= 256:
                histnbins = histrange
                histstep = 1.0
                tmpmeta["STATISTICS_HISTOBINFUNCTION"] = 'direct'
                histCalcMin = histmin - 0.5
                histCalcMax = histmax + 0.5
            else:
                histnbins = 256
                tmpmeta["STATISTICS_HISTOBINFUNCTION"] = 'linear'
                histCalcMin = histmin
                histCalcMax = histmax
                histstep = float(histCalcMax - histCalcMin) / histnbins
        elif band.DataType in gdalFloatTypes:
            histnbins = 256
            (histmin, histmax) = (minval, maxval)
            tmpmeta["STATISTICS_HISTOBINFUNCTION"] = 'linear'
            histCalcMin = minval
            histCalcMax = maxval
            if histCalcMin == histCalcMax:
                histCalcMax = histCalcMax + 0.5
                histnbins = 1
            histstep = float(histCalcMax - histCalcMin) / histnbins
        # Note that the complex number data types are not handled, as I am not sure
        # what a histogram or a median would mean for such types. 
      
        userdata = ProgressUserData()
        userdata.progress = progress
        userdata.nbands = ds.RasterCount * 2
        userdata.curroffset = percent
      
        # Get histogram and force GDAL to recalculate it. Note that we use include_out_of_range=True,
        # which is safe because we have calculated the histCalcMin/Max from the data. 
        hist = band.GetHistogram(histCalcMin, histCalcMax, histnbins, True,
                        approx_ok, progressFunc, userdata)
        
        # Check if GDAL's histogram code overflowed. This is not a fool-proof test,
        # as some overflows will not result in negative counts. 
        histogramOverflow = (min(hist) < 0)
        
        # we may use this ratObj reference for the colours below also
        # may be None if format does not support RATs
        ratObj = band.GetDefaultRAT()

        if not histogramOverflow:
            # comes back as a list for some reason
            hist = numpy.array(hist)

            # Note that we have explicitly set histstep in each datatype case 
            # above. In principle, this can be calculated, as it is done in the 
            # float case, but for some of the others we need it to be exactly
            # equal to 1, so we set it explicitly there, to avoid rounding
            # error problems. 

            # do the mode - bin with the highest count
            modebin = numpy.argmax(hist)
            modeval = modebin * histstep + histmin
            if band.DataType == gdal.GDT_Float32 or band.DataType == gdal.GDT_Float64:
                tmpmeta["STATISTICS_MODE"] = repr(modeval)
            else:
                tmpmeta["STATISTICS_MODE"] = repr(int(round(modeval)))

            if ratObj is not None and not disableRFC40:
                histIndx, histNew = findOrCreateColumn(ratObj, gdal.GFU_PixelCount, 
                                        "Histogram", gdal.GFT_Real)
                # write the hist in a single go
                ratObj.SetRowCount(histnbins)
                ratObj.WriteArray(hist, histIndx)

                ratObj.SetLinearBinning(histmin, (histCalcMax - histCalcMin) / histnbins)

                # The HFA driver still honours the STATISTICS_HISTOBINVALUES
                # metadata item. If we are recalculating the histogram the old
                # values will be copied across with the metadata so clobber it
                if "STATISTICS_HISTOBINVALUES" in tmpmeta:
                    del tmpmeta["STATISTICS_HISTOBINVALUES"]
            else:
                # Use GDAL's original metadata interface, for drivers which
                # don't support the more modern approach
                tmpmeta["STATISTICS_HISTOBINVALUES"] = '|'.join(map(repr, hist)) + '|'

                tmpmeta["STATISTICS_HISTOMIN"] = repr(histmin)
                tmpmeta["STATISTICS_HISTOMAX"] = repr(histmax)
                tmpmeta["STATISTICS_HISTONUMBINS"] = repr(histnbins)

            # estimate the median - bin with the middle number
            middlenum = hist.sum() / 2
            gtmiddle = hist.cumsum() >= middlenum
            medianbin = gtmiddle.nonzero()[0][0]
            medianval = medianbin * histstep + histmin
            if band.DataType == gdal.GDT_Float32 or band.DataType == gdal.GDT_Float64:
                tmpmeta["STATISTICS_MEDIAN"] = repr(medianval)
            else:
                tmpmeta["STATISTICS_MEDIAN"] = repr(int(round(medianval)))
    
        # set the data
        band.SetMetadata(tmpmeta)

        if ratObj is not None and not ratObj.ChangesAreWrittenToFile():
            # For drivers that require the in memory thing
            band.SetDefaultRAT(ratObj)

        percent = percent + percentstep
        progress.setProgress(percent)

        if progress.wasCancelled():
            raise ProcessCancelledError()
    
    progress.setProgress(100)
    
    
[docs]def calcStats(ds, progress=None, ignore=None,
        minoverviewdim=DEFAULT_MINOVERVIEWDIM, 
        levels=DEFAULT_OVERVIEWLEVELS,
        aggregationType=None, approx_ok=False):
    """
    Does both the stats and pyramid layers. Calls addPyramid()
    and addStatistics() functions. See their docstrings for details. 
    
    """
    if progress is None:
        progress = cuiprogress.SilentProgress()
    
    if ignore is not None:
        setNullValue(ds, ignore)

    addPyramid(ds, progress, minoverviewdim=minoverviewdim, levels=levels, 
        aggregationType=aggregationType)

    addStatistics(ds, progress, ignore, approx_ok=approx_ok)


[docs]def setNullValue(ds, nullValue):
    """
    Set the given null value on all bands of the given Dataset
    """
    for i in range(ds.RasterCount):
        band = ds.GetRasterBand(i + 1)
        band.SetNoDataValue(nullValue)