#!/bin/sh

# Public domain notice for all NCBI EDirect scripts is located at:
# https://www.ncbi.nlm.nih.gov/books/NBK179288/#chapter6.Public_Domain_Notice

# Consume any leading mode keywords.  Every accepted spelling (with or
# without a leading dash) simply switches pma2pme to true; the first
# argument that is not one of these keywords stops the scan and is left
# in place for the code that follows.  The flag is consulted later in the
# script to decide whether an ASN.1 copy of each file is also archived.
pma2pme=false

while [ $# -gt 0 ]; do
  case "$1" in
    pma2pme | -pma2pme) ;;
    asn | -asn | asn1 | -asn1 | asn.1 | -asn.1) ;;
    *) break ;;
  esac
  # only reached for a recognized keyword
  pma2pme=true
  shift
done

# The first remaining argument is the archive directory and is mandatory.
# The complaint is a diagnostic, so it goes to stderr.
if [ "$#" -eq 0 ]
then
  echo "Must supply path to archive files" >&2
  exit 1
fi

archive="$1"
shift

# Two spellings of the same directory are kept:
#   native  - the path exactly as given, used by this script's own file
#             operations (cd, appending to the *.uid lists)
#   archive - the path handed to rchive via -archive
native="$archive"

# Normalize the kernel name so that Cygwin and MinGW both read CYGWIN_NT,
# then let cygpath -w rewrite the rchive path when cygpath is available.
# Two separate tests joined by && replace the obsolescent "-a" primary.
osname=$(uname -s | sed -e 's/_NT-.*$/_NT/; s/^MINGW[0-9]*/CYGWIN/')
if [ "$osname" = "CYGWIN_NT" ] && [ -x /bin/cygpath ]
then
  archive=$(cygpath -w "$archive")
fi

# Drop a single trailing slash so later "$native/file" joins stay clean.
archive=${archive%/}
native=${native%/}

# deleteCitations FILE
#
# Handles the DeleteCitation entries of one uncompressed update file:
#   1. extracts the PMIDs listed there (sorted numerically, de-duplicated),
#   2. asks rchive -trie for the matching archive entries and removes them
#      (and their "asn" counterparts) from under $native,
#   3. appends the PMIDs to $native/deleted.uid.
# Scratch lists are dot-files in the current directory and are removed
# before returning.  Reads the global $native.  The variables assigned here
# are not local (plain sh), so they stay visible to the caller.
deleteCitations() {
  inp="$1"
  pmidlist=.TO-REPORT
  delenda=.TO-DELETE
  delasn=.TO-DELASN

  # The EDirect tools are fed through a pipe, exactly as before.
  cat "$inp" |
  xtract -pattern DeleteCitation -block PMID -tab "\n" -sep "." -element "PMID" |
  sort -n |
  uniq > "$pmidlist"

  # One output line per PMID; presumably the record's path relative to the
  # archive root, since the lines are passed to rm from inside $native.
  cat "$pmidlist" | rchive -trie -gzip | sort -n | uniq > "$delenda"

  if [ -s "$delenda" ]; then
    # The redirection is opened in the current directory before the
    # subshell changes into $native, so the relative scratch name resolves.
    (cd "$native" && xargs rm -f) < "$delenda"

    # Same list with every "xml" replaced by "asn".
    sed -e 's/xml/asn/g' "$delenda" > "$delasn"
    if [ -s "$delasn" ]; then
      (cd "$native" && xargs rm -f) < "$delasn"
    fi
  fi

  if [ -s "$pmidlist" ]; then
    cat "$pmidlist" >> "$native/deleted.uid"
  fi

  rm -f "$pmidlist" "$delenda" "$delasn"
}

# reportVersioned FILE
#
# Lists the PMIDs of PubmedArticle records in FILE whose
# MedlineCitation/PMID carries a Version attribute greater than 1, and
# appends the sorted, de-duplicated list to $native/versioned.uid.
# Nothing is appended when no such record is found.  Uses a scratch
# dot-file in the current directory and reads the global $native.
reportVersioned() {
  inp="$1"
  pmidlist=.TO-REPORT

  xtract -input "$inp" \
    -pattern PubmedArticle \
    -block MedlineCitation/PMID \
    -if "@Version" -gt 1 \
    -element "PMID" |
  sort -n |
  uniq > "$pmidlist"

  if [ -s "$pmidlist" ]; then
    cat "$pmidlist" >> "$native/versioned.uid"
  fi

  rm "$pmidlist"
}

# Main pass: archive every *.xml.gz file in the current directory that does
# not yet have a matching .snt sentinel.

# Remove any "versioned" pair from the working directory first
# (presumably so the loop below never picks it up - confirm with the
# download scripts that may create it).
rm -f "versioned.xml.gz"
rm -f "versioned.snt"

# The slow-disk warning is printed at most once per run, when a single file
# takes longer than $timeout seconds.
needToReport=true
timeout=100
if [ "$pma2pme" = true ]
then
  # Currently the same limit as the default; kept as a separate assignment
  # so the ASN.1 mode can be given its own value.
  timeout=100
fi

for fl in *.xml.gz
do
  # With no matching files sh leaves the literal pattern in $fl; without
  # this guard the loop would run the tools on "*.xml.gz" and leave a stray
  # "*.snt" sentinel behind.
  [ -f "$fl" ] || continue

  base=${fl%.xml.gz}
  # A sentinel means this file was handled by an earlier run.
  if [ -f "$base.snt" ]
  then
    continue
  fi
  secnds_start=$(date "+%s")
  echo "$base.xml"

  # Decompress and reformat into a temporary $base.xml.
  gunzip -c "$fl" |
  transmute -compress -strict -wrp PubmedArticleSet \
    -pattern "PubmedArticleSet/*" -format flush > "$base.xml"

  # Hand the records to rchive for storage under $archive.
  rchive -gzip -input "$base.xml" -archive "$archive" \
    -index MedlineCitation/PMID^Version -pattern PubmedArticle

  if [ "$pma2pme" = true ]
  then
    # Optional second copy: convert to ASN.1 and archive that as well.
    cat "$base.xml" | pma2pme -xml > "$base.asn"
    rchive -asn -gzip -input "$base.asn" -archive "$archive" \
      -index Pubmed-entry/pmid_ -pattern Pubmed-entry
    rm "$base.asn"
  fi

  deleteCitations "$base.xml"
  reportVersioned "$base.xml"

  # Mark the file as done, then discard the temporary XML.
  touch "$base.snt"
  rm "$base.xml"

  secnds_end=$(date "+%s")
  secnds=$((secnds_end - secnds_start))
  if [ "$needToReport" = true ]
  then
    if [ "$secnds" -gt "$timeout" ]
    then
      echo ""
      echo "ARCHIVING IS SLOWER THAN EXPECTED."
      echo ""
      echo "PLEASE ENSURE THAT ANTIVIRUS SCANNING AND CONTENT INDEXING ARE DISABLED,"
      echo "AND THAT TRIM SUPPORT IS ENABLED FOR THE SOLID STATE DRIVE."
      echo ""
      needToReport=false
    fi
  fi
done
