Sunday, September 13, 2015

Remove BOM (Byte-order mark) from a file

I got this script from the following page:
http://thegreyblog.blogspot.in/2010/09/shell-script-to-find-and-remove-bom.html


#!/bin/bash
#
# bom-remove: strip the UTF-8 byte-order mark (EF BB BF) from files.
# Run without arguments to see the usage text.

# Abort on the first failing command (-e) and on any reference to an
# unset variable (-u).
set -eu

# --- Option state; parseArgs overwrites these from the command line ---
RECURSIVE=false            # -r: descend into directories
USE_EXT=false              # -e: only touch files with extension FILE_EXT
FILE_EXT=""
XDEV=""                    # -x: becomes "-xdev" and is handed to find
PROCESSALLFILE=false       # -a: strip the BOM on every line, not just line 1
PROCESSALLFILE_FLAG=""     # "-a" when set, forwarded to recursive invocations
DELETE_ORIG=true           # -d clears this so the original is left untouched
DELETE_FLAG=""             # "-d" when set, forwarded to recursive invocations
PROCESSING_FILES=false     # -f: set when the script is re-run by find -exec

# --- Tooling; adjusted per platform and by -s ---
SED_EXEC=sed               # stream editor command used to strip the BOM
TMP_CMD="mktemp"           # command that creates the temporary file
TMP_OPTS="--tmpdir="       # mktemp option prefix; processFile appends a directory
ISDARWIN=false             # set to true when uname reports Darwin


# Platform-specific tooling: pick the stream editor and the mktemp option
# syntax that match the operating system reported by uname.
case "$(uname)" in
  SunOS)
    # Use the GNU sed under /usr/gnu/bin when it is installed.
    if [ -x /usr/gnu/bin/sed ] ; then
      echo "Using GNU sed..."
      SED_EXEC=/usr/gnu/bin/sed
    fi
    # The trailing space matters: processFile appends the directory and
    # relies on word splitting to turn this into "-p" "<dir>".
    TMP_OPTS="-p "
    ;;
  Darwin)
    # Here the script switches from sed to perl and uses "-t tmp" for mktemp.
    TMP_OPTS="-t tmp"
    SED_EXEC="perl -pe"
    echo "Using perl..."
    ISDARWIN=true
    ;;
esac


# usage
# Print the command-line synopsis and the meaning of each option on stdout.
function usage() {
  # One argument per output line; '%s\n' is reapplied to each of them.
  printf '%s\n' \
    "bom-remove [-adrx] [-s sed-name] [-e ext] files..." \
    "" \
    "  -a    Remove the BOM throughout the entire file." \
    "  -e    Look only for files with the chosen extensions." \
    "  -d    Do not overwrite original files and do not remove temp files." \
    "  -r    Scan subdirectories." \
    "  -s    Specify an alternate sed implementation." \
    "  -x    Don't descend directories in other filesystems."
}


# checkExecutable NAME
# Verify that NAME can be run, either as a command found through $PATH or
# as a path to an executable file.
#
# Arguments: $1 - command name or path to check
# Outputs:   an error message on stdout when the command is missing
# Exits:     with status 4 when the command cannot be found
function checkExecutable() {
  # ${1:-} keeps "set -o nounset" from aborting with an obscure message
  # when the caller expands an empty variable into no argument at all.
  local candidate=${1:-}

  # "command -v" is a shell builtin, so the lookup itself cannot fail just
  # because the external "which" utility is not installed.
  if ! command -v "$candidate" > /dev/null 2>&1 ; then
    echo "Cannot find executable: $candidate"
    exit 4
  fi
}


# parseArgs ARGS...
# Parse the command line into the script's global option variables and
# store the remaining operands in the FILES array.
#
# Globals written: PROCESSALLFILE, PROCESSALLFILE_FLAG, RECURSIVE,
#                  PROCESSING_FILES, SED_EXEC, USE_EXT, FILE_EXT,
#                  DELETE_ORIG, DELETE_FLAG, XDEV, FILES
# Globals read:    ISDARWIN, TMP_CMD
# Exits:           2 on an unknown option or when no file is given,
#                  1 when -r and -f are combined,
#                  4 (via checkExecutable) when a required tool is missing
function parseArgs() {
  # Keep getopts' bookkeeping local so the parser always starts from the
  # first argument and does not leak state into the rest of the script.
  local OPTIND=1 OPTARG flag

  while getopts "adfrs:e:x" flag
  do
    case $flag in
      a) PROCESSALLFILE=true ; PROCESSALLFILE_FLAG="-a" ;;
      r) RECURSIVE=true ;;
      # -f is what doJob passes when find re-runs the script on one file.
      f) PROCESSING_FILES=true ;;
      s) SED_EXEC=$OPTARG ;;
      e) USE_EXT=true ; FILE_EXT=$OPTARG ;;
      d) DELETE_ORIG=false ; DELETE_FLAG="-d" ;;
      x) XDEV="-xdev" ;;
      *) echo "Unknown parameter." ; usage ; exit 2 ;;
    esac
  done

  # Drop the parsed options; what is left are the files/directories.
  shift $((OPTIND - 1))

  if [ $# -eq 0 ] ; then
    usage
    exit 2
  fi

  # fixing darwin
  if [[ $ISDARWIN == true && $PROCESSALLFILE == false ]] ; then
    PROCESSALLFILE=true
    echo "Process all file is implicitly set on Darwin."
  fi

  FILES=("$@")

  # Test the number of operands, not the text of the first one: an empty
  # first argument is still an argument, and the message promises an exit.
  if [ "${#FILES[@]}" -eq 0 ] ; then
    echo "No files specified. Exiting."
    exit 2
  fi

  if [ "$RECURSIVE" == true ] && [ "$PROCESSING_FILES" == true ] ; then
    echo "Cannot use -r and -f at the same time."
    usage
    exit 1
  fi

  # Deliberately unquoted: SED_EXEC may hold a command plus options
  # (e.g. "perl -pe"); word splitting makes the command name $1.
  checkExecutable $SED_EXEC
  checkExecutable $TMP_CMD
}


# processFile FILE
# Strip the UTF-8 byte-order mark (bytes EF BB BF) from FILE.
#
# The cleaned text is first written to a temporary file. Unless -d was
# given (DELETE_ORIG=false), the result is then copied back over FILE and
# the temporary file is removed.
#
# Globals read: TMP_CMD, TMP_OPTS, SED_EXEC, PROCESSALLFILE, DELETE_ORIG
# Arguments:    $1 - path of the file to clean
# Outputs:      progress messages on stdout, failures on stderr
# Returns:      0 on success; non-zero when the temp file cannot be created,
#               FILE cannot be filtered, or the result cannot be written back
function processFile() {
  local TEMPFILENAME
  local sed_script

  # $TMP_CMD/$TMP_OPTS are intentionally unquoted: TMP_OPTS may be an option
  # plus a separating space (e.g. "-p "), and the quoted directory is glued
  # on right after it.
  if [ "$(uname)" == "Darwin" ] ; then
    TEMPFILENAME=$($TMP_CMD $TMP_OPTS) || return 1
  else
    TEMPFILENAME=$($TMP_CMD $TMP_OPTS"$(dirname "$1")") || return 1
  fi
  echo "Processing $1 using temp file $TEMPFILENAME"

  if [ "$PROCESSALLFILE" == false ] ; then
    sed_script='1 s/\xEF\xBB\xBF//'   # first line only
  else
    sed_script='s/\xEF\xBB\xBF//g'    # every occurrence in the file
  fi

  # Read FILE through a redirection instead of "cat FILE |": if FILE cannot
  # be read the command fails right here, rather than silently producing an
  # empty temp file that would later replace the original.
  # $SED_EXEC is unquoted on purpose: it may carry options ("perl -pe").
  if ! $SED_EXEC "$sed_script" < "$1" > "$TEMPFILENAME" ; then
    echo "Failed to process $1. Leaving it untouched." >&2
    rm -f -- "$TEMPFILENAME"
    return 1
  fi

  if [ "$DELETE_ORIG" == true ] ; then
    if [ ! -w "$1" ] ; then
      echo "$1 is not writable. Leaving tempfile."
    else
      echo "Removing temp file..."
      # Overwrite the contents in place rather than renaming the temp file
      # over the original: mktemp creates files readable by the owner only,
      # so a rename would silently change FILE's permissions and ownership
      # and would break hard links and symlinks. The temp file is removed
      # only after the copy succeeded, so the cleaned data is never lost.
      if ! cp -- "$TEMPFILENAME" "$1" ; then
        echo "Could not write $1. Cleaned copy kept in $TEMPFILENAME." >&2
        return 1
      fi
      rm -f -- "$TEMPFILENAME"
    fi
  fi
}


# doJob
# Process every entry of the FILES array.
#
# With -f (PROCESSING_FILES=true) each entry is echoed and handed straight
# to processFile. Otherwise missing entries are skipped with a message,
# regular files are processed, and directories are either skipped or, with
# -r, walked by find, which re-invokes this script with -f on each file.
#
# Globals read: FILES, PROCESSING_FILES, RECURSIVE, USE_EXT, FILE_EXT, XDEV,
#               DELETE_FLAG, PROCESSALLFILE_FLAG, SED_EXEC
function doJob() {
  local entry
  local -a child_opts find_args

  # Check if the script has been called from the outside.
  if [ "$PROCESSING_FILES" == true ] ; then
    for entry in ${FILES[@]+"${FILES[@]}"}
    do
      printf '%s\n' "$entry"
      processFile "$entry"
    done
    return
  fi

  # Options forwarded to the per-file child invocations. The stream editor
  # is always passed on so that a custom one chosen with -s is honoured in
  # recursive mode as well; the optional flags are added only when set, so
  # no empty argument is ever passed.
  child_opts=(-s "$SED_EXEC")
  if [ -n "$DELETE_FLAG" ] ; then
    child_opts+=("$DELETE_FLAG")
  fi
  if [ -n "$PROCESSALLFILE_FLAG" ] ; then
    child_opts+=("$PROCESSALLFILE_FLAG")
  fi

  for entry in ${FILES[@]+"${FILES[@]}"}
  do
    # checking if file or directory exist
    if [ ! -e "$entry" ] ; then
      echo "File not found: $entry. Skipping..."
      continue
    fi

    if [ ! -d "$entry" ] ; then
      processFile "$entry"
      continue
    fi

    # if a parameter is a directory, process it recursively if RECURSIVE is set
    if [ "$RECURSIVE" != true ] ; then
      echo "$entry is a directory. Skipping..."
      continue
    fi

    find_args=("$entry")
    if [ -n "$XDEV" ] ; then
      find_args+=("$XDEV")
    fi
    find_args+=(-type f)
    if [ "$USE_EXT" == true ] ; then
      find_args+=(-name "*.$FILE_EXT")
    fi

    find "${find_args[@]}" -exec "$0" "${child_opts[@]}" -f "{}" \;
  done
}


# Entry point: parse the command line into the global option variables and
# the FILES array, then process every requested file or directory.
# The two calls are kept as separate statements on purpose: joining them in
# an && list would switch off errexit inside parseArgs.
parseArgs "$@"
doJob


Examples
Assuming the script is in your $PATH and is called bom-remove, you can "clean" a bunch of files by invoking it this way:

$ bom-remove file-to-clean ...

If you want to clean the files in an entire directory, you can use the following syntax:

$ bom-remove -r dir-to-clean ...

If your sed installation is not in your $PATH or you have to use an alternate version, you can invoke the script with the following syntax:

$ bom-remove -s path/to/sed file-to-clean ...

If you want to clean a directory in which other file systems might be mounted, you can use the -x option so that the script does not descend into them:

$ bom-remove -xr dir-to-clean ...


Seriously!!! This literally saved my day!!!


No comments:

Post a Comment