#!/usr/local/bin/perl -w

###############################################################
# vamp_txt2xml                                                #
#                                                             #
# Component of VAMP                                           #
#                                                             #
# Copyright (C) 2005 Institut Curie                           #
# Author(s): Philippe La Rosa (Institut Curie) 2005           #
# Contact: Philippe.La-Rosa@curie.fr                          #
# It is strictly forbidden to transfer, use or re-use this    #
# code or part of it without explicit written authorization   #
# from Institut Curie.                                        #
###############################################################
#                                                             #
# Philippe La Rosa le 12/12/2005                              #
#                                                             #
# Component of  interface VAMP                                #
#                                                             #
# Generation of files XML which will be taken in entry of the #
# interface VAMP.                                             #
# This generation is obtained starting from a data file in the#
# format csv which result from results of the analysis of a   #
# chip CGH-ARRAYS or TRANSCRIPTOME, or LOH, or SNPs, or...    #
#                                                             #
###############################################################
# Obligatory parameters :                                     #
# -path_name : the path of the input file (cvs)               #
# -chip_name : the name of the chip (without space)           #
# -xml : output folder XML                                    #
# -project_name : the name of the project                     #
###############################################################
#  Optional parameters :                                      #
# -id_project : Id of the project                             #
# -organism : Organism (human, mouse, ...)                    #
# -team : The name of the team                                #
# -number_histo : The histological number                     #
# -type_chip : The type of chip  (CGH, TRS)                   #
# -type_object : The type of object (Clone,  ProbeSet)        #
# -mode_ratio : Mode of the ratio M ou L (ratio or LogRatio)  #
# -center_pos : Mode of calculation for the position of the   #
#  object, in the center if the option is equal true, and at  #
# the beginning by default.                                   #
###############################################################
# The input file must be with the format cvs (description of  #
# the fields on the first line) and separator: , (comma).     #
# The fields of the first line will be tags XML.              #
#   example :                                                 #
# Y, X, Chr, Name, Smt, Bkp, Out, Gnl                         #
# -0.13,     13850, 1, IMAGE:322807, 0.00,  0,  0,  0         #
# -0.10,     71204, 1, IMAGE:782434, 0.00,  0,  0,  0         #
#  0.05,    167764, 1, IMAGE:190915, 0.00,  0,  0,  0         #
# -0.04,    187455, 1, IMAGE:810801, 0.00,  0,  0,  0         #
# -0.07,    236684, 1, IMAGE:742132, 0.00,  0,  0,  0         #
#  0.07,    526125, 1, IMAGE:364685, 0.00,  0,  0,  0         #
#                                                             #
# - tags obligatory: Y, X, Chr, Name for CGH                  #
#                   : Signal, PosBegin, PosEnd, Chr, ObjectId #
#                     for Transcriptome                       #
#                                                             #
# Significance: X: position on the genome (must be ordered)   #
#                   Y : value of ratio;                       # 
#                   Chr : name of the chromosome;             # 
#                   Name of the object (clone, probeset,...)  #
# - The following tags allow the taking into account in VAMP  #
#   and thus the use of the results of algorytme GLAD         #
#   (Phupe et al. 2004).                                      #
#   They must name imperatively:  : Smt, Bkp, Out, Gnl        #
# Significance: Smt : value of Smoothing;                     #
#               Bkp  : Breakpoints;                           #
#               Out : Outliers;                               #
#               Gnl : ZoneGNL (Gain, Normal, Loss)            #
# All other Tags and thus all the other fields will be        #
# visible in the windows of information of the interface.     #
###############################################################

use strict;
use Getopt::Long;
use English ;
use File::stat;
use POSIX qw(strftime);

$PROGRAM_NAME  =~ m![^/]+$! ;
my $DEFAULT_TYPE_PUCE = "CGH";
my $pathName;
my $puceName; 
my $chemxml;  
my $project;
my $id_project;
my $numhisto;
my $organism;
my $team;
my $type_puce;
my $type_object;
my $obj_key;
my $mode_ratio;
my $center_pos;
my $image_software;
my @tchr;
my @tags;
my %GenChr;

##########################################
# Usage                                  #
#                                        #
# Parametre(s) :                         # 
#                                        #
##########################################
sub usage {
    die   "Usage: $PROGRAM_NAME -path_name path of the input file (cvs) -chip_name name of the chip -xml output repertory XML -project_name name of the project [-id_project project id, -img_software image software, -number_histo histological number, -organism organism (human, mouse, ...), -team name of the team, -type_chip type of chip (CGH, TRS, ...), -type_object type of object (Clone,  ProbeSet), -mode_ratio (L or M), -obj_key key for single name], -center_pos true\n\n";
}

# Parse les arguments et les options de la ligne de commande
GetOptions("path_name=s" => \$pathName, "chip_name=s" => \$puceName, "xml=s" => \$chemxml, "project_name=s" => \$project, "id_project=s" => \$id_project, "organism=s" => \$organism, "team=s" => \$team, "number_histo=s" => \$numhisto, "type_chip=s" => \$type_puce, "type_object=s" => \$type_object, "mode_ratio=s" => \$mode_ratio, "center_pos=s" => \$center_pos, "obj_key=s" => \$obj_key, "img_software=s" => \$image_software) or usage;

usage if ( (!$pathName) || (!$puceName) || (!$chemxml) || (!$project) );


##########################################
# Creation d'un tableau de data, du      #
# tableau de TAGS et du tableau de       #
# description des chromosomes a partir   #
# du fichier d'entrée.                   #
#                                        #
# Parametre(s) :                         # 
# Nom du fichier                         #                    
##########################################
# retour : nombre d'objets (clones, ...) #
##########################################
sub getData {
    my ($pathName) = @_;
    my $line;
    my $nb_line;
    my $nb_object;
    my $posChr;
    my @ret;
    my $TAG_CHROMOSOME = "Chr";
    my %tuniq;

    open(FICTXT, $pathName) or die "Can't open $pathName ($!)\n";
    while ($line = <FICTXT>)
    {
      chomp($line);
      #$line =~ s/ //g;
      # construction du tableau de description des tags
      if (!$nb_line) {
	  (@tags) = split(/\,/, $line);
          my $i;
          for($i = 0; $i < scalar(@tags); $i++) {
	      $posChr = $i if ($tags[$i] eq $TAG_CHROMOSOME);
	  }
      }
      else {
          my (@recup) = split(/\,/, $line);
          # construction du tableau de description des chromosomes
          if (not exists($tuniq{$recup[$posChr]})){
	      $tuniq{$recup[$posChr]} = $recup[$posChr];
              my $numchr = $recup[$posChr];
              @{$GenChr{$numchr}} = ();
              $numchr = sprintf("%02d", $recup[$posChr]) if ( ($recup[$posChr] ne 'X') and ($recup[$posChr] ne 'Y') );
              push(@tchr, $numchr);
	  }
          # ajout dans un tableau de tableau de chromosomes les objets correspondants
          my $numchr = $recup[$posChr];
          push( @{$GenChr{$numchr}}, $line);
          $nb_object++;
      }
      $nb_line++
    }
    close FICTXT;
    @tchr = sort @tchr;
    my $i;
    for($i = 0; $i < scalar(@tchr); $i++) {
      $tchr[$i] =~ s/^0//g;
    }
    return ($nb_object);
}


##########################################
# Creation Xml.                          #
#                                        #
# Parametre(s) :                         #
# le nom de la puce                      #
# le nom du chromosome                   #
# le nom du projet                       #
# l'id du projet                         #
# le numero histo                        #
# l'adresse url du fichier               #
# le nombre d'objets (clone)             #
# le date d'analyse                      #
# l'organisme                            #
# l'equipe                               #
# le mode de ratio                       #
# le tableau de data                     #                    
##########################################
# retour : chaine XML.                   #
##########################################
sub createXml {
    my ($puceName, $chr, $project, $id_project, $numhisto, $chemurlall, $objectNumber, $dateAnalyse, $organism, $team, $mode_ratio, $type_puce, $type_object) = @_;
    my $ORGANISM = "Human";
    my $DEFAULT_TYPE_OBJECT = "Clone";
    my $TOP = "Array";
    my $CORPS = "Obj";
    my $TEAM = "Public";
    my $MODRATIO = "L";
    my $OBJ_KEY = "Name";
    my $IMG_SOFT = "NA";
    my $NBREP = 3;
    my $line;
    my $ret;
    my $td;
    my $spotNumber = ($objectNumber * $NBREP);
    $organism = $ORGANISM if (!$organism);
    $team = $TEAM if (!$team);
    $mode_ratio = $MODRATIO if (!$mode_ratio);
    $type_object = $DEFAULT_TYPE_OBJECT if (!$type_object);
    $obj_key = $OBJ_KEY if (!$obj_key);
    $image_software = $IMG_SOFT if (!$image_software);

    if ($type_puce eq "TRS") {
	$obj_key = "ObjectId";
    }

    $ret = qq |<$TOP>
<Team>$team</Team>
<Organism>$organism</Organism>
<Project>$project</Project>
<ProjectId>$id_project</ProjectId>
<NumHisto>$numhisto</NumHisto>\n|;
        $ret .= qq |<SampleAdditionalData URL="additional/$id_project/$numhisto.xml"/>\n| if (($numhisto ne "NA") and ($id_project ne "NA"));
        $ret .= qq |<Chr>$chr</Chr>
<Name>$puceName</Name>
<Date>$dateAnalyse</Date>
<Type>$type_puce</Type>
<ObjKey>$obj_key</ObjKey>
<ImgSoftware>$image_software</ImgSoftware>
<NbSpot>$spotNumber</NbSpot>
<NbRep>$NBREP</NbRep>
<Ratio>$mode_ratio</Ratio>
<Url>$chemurlall</Url>\n|;
$ret .= qq |<CenterPos>True</CenterPos>\n| if ($center_pos);
$ret .= qq |<NbObj>$objectNumber</NbObj>\n|;

 
    for($td = 0; $td < scalar(@{$GenChr{$chr}}); $td++){
      $line = @{$GenChr{$chr}}[$td];
      my (@recup) = split(/\,/, $line);
      if (scalar(@recup)) {
	  my $u;
	  $ret .= qq |<$CORPS>\n<Properties>\n<Type>$type_object</Type>|;
	  for($u = 0; $u < scalar(@recup); $u++)
	  {
             # pour enlever l'eventuel espace de debut
             #my (@recup1) = split(/ /, $recup[$u]);
             #my $tr = scalar(@recup1);
             #$recup[$u] = $recup1[$tr - 1];
                
             # pour enlever les eventuelle "" 
             if ($recup[$u] =~ /\"/)
             {
		my (@re) = split(/\"/, $recup[$u]);
                $recup[$u] = $re[1];
	     }
             $ret .= qq |<$tags[$u]>$recup[$u]</$tags[$u]>| if ($tags[$u]);
	  }
          $ret .= qq |\n</Properties>\n</$CORPS>\n|;
        }
    }

    # affichage fin
    $ret .= qq |</$TOP>\n|;

  return $ret;
}

  $id_project = $project if (!$id_project);
  $numhisto = $puceName if (!$numhisto);
  $type_puce = $DEFAULT_TYPE_PUCE if (!$type_puce);
  my $time = strftime"%d%m%Y%M%S",localtime;
  my $file_tmp = "/tmp/" . $time . rand($time);
  my $st = stat($pathName) or die "Can't acces $pathName ($!)\n";
  my $dateAnalyse = localtime($st->ctime);
  my $objectNumber = &getData($pathName);
  my $baseurl = `basename $chemxml`;
  chomp $baseurl;

  
  # test existence rep all, projet, projet/chr, projet/array: sinon creation
  my $chem_all = $chemxml . "/all";
  my $chem_project = $chemxml . "/$project";
  my $chem_projectchr = $chemxml . "/$project/chr";
  my $chem_projectarray = $chemxml . "/$project/array";
   
  if ($type_puce eq "TRS") {
      my $chem_idproject = $chemxml . "/$id_project";
      mkdir $chem_idproject if (!-e $chem_idproject);
      $chem_project = $chemxml . "/$id_project/$numhisto";
      $chem_projectchr = $chemxml . "/$id_project/$numhisto/chr";
      $chem_projectarray = $chemxml . "/$id_project/$numhisto/array";
  }
  mkdir $chem_project if (!-e $chem_project);
  mkdir $chem_all if (!-e $chem_all);
  mkdir $chem_projectchr if (!-e $chem_projectchr);
  mkdir $chem_projectarray if (!-e $chem_projectarray);

 
  # creer fichier dans /projet/array/puce.xml avec les balise d'entete
  my $chemficarray = $chem_projectarray . "/" . $puceName . ".xml";
  open(FILEARRAY," > $chemficarray") or die "can't create 1 $chemficarray ($!)\n";
      printf FILEARRAY qq |<?xml version='1.0' encoding='iso-8859-1'?> \n|;
      printf FILEARRAY qq |<ArraySet> \n|;
      printf FILEARRAY qq |<SetName>$puceName</SetName> \n|;

  my $u;
  for($u = 0; $u < scalar(@tchr); $u++)
  {
    # test existence rep all/chr$u : sinon creation 
    my $numchr = $tchr[$u];
    if ($type_puce ne "TRS") {
	$numchr = sprintf("%02d", $tchr[$u]) if ( ($tchr[$u] ne 'X') and ($tchr[$u] ne 'Y') );
    }
    my $chem_allchr = $chem_all . "/chr$numchr";
    mkdir $chem_allchr if (!-e $chem_allchr);

    # test existence fichier projet/chr/chr$u :
    my $chemficchr = $chem_projectchr . "/" ."chr$numchr" . ".xml";
    if (-e $chemficchr) {
         # - si oui :
         #   - sauvegarde contenu fichier sauf balise de fermeture dans fichier intermediaire /tmp/$deb (systeme)
         system "egrep -v '</ArraySet>' $chemficchr > $file_tmp";
    }
    else {
         # - si non :
         #   - creation fichier intermediaire /tmp/$deb avec des balise d'entete 
         #   - close /tmp/$deb 
	 open(FILECHR," > $file_tmp") or die "can't create $chemficarray ($!)\n";
         printf FILECHR qq |<?xml version='1.0' encoding='iso-8859-1'?> \n|;
         printf FILECHR qq |<ArraySet> \n|;
         printf FILECHR qq |<SetName>chr$numchr</SetName> \n|;
         close(FILECHR);
    }

    # creer fichier dans all/chr$u/puce.xml avec les balise d'entete
    my $ficall = $chem_allchr . "/$puceName.xml";
    open(FILEALL," > $ficall") or die "can't create $chemficarray ($!)\n";
    printf FILEALL qq |<?xml version='1.0' encoding='iso-8859-1'?> \n|;
    printf FILEALL qq |<ArraySet> \n|;
    printf FILEALL qq |<SetName>$puceName</SetName> \n|;

    my $chemurlall = $baseurl . "/all/chr$numchr/$puceName.xml";

    my $corps = &createXml($puceName, $tchr[$u], $project, $id_project, $numhisto, $chemurlall, $objectNumber, $dateAnalyse, $organism, $team, $mode_ratio, $type_puce, $type_object);
   
    # ajout de $corps dans all/chr$u/puce.xml
    printf FILEALL qq |$corps \n|;
    # ajout balise de fermeture dans all/chr$u/puce.xml
    printf FILEALL qq |</ArraySet> \n|;
    # close all/chr$u/puce.xml
    close(FILEALL);

    # pour fichier /projet/array/puce.xml
    # ajout de $corps dans /projet/array/puce.xml
    printf FILEARRAY qq |$corps \n|;

    # pour fichier projet/chr/chr$u :
    # ouverture de /tmp/$deb
    open(FILECHR," >> $file_tmp") or die "can't open $chemficarray ($!)\n";
    # ajout de $corps dans fichier /tmp/$deb

    printf FILECHR qq |$corps \n|;
    # ajout balise de fermeture dans /tmp/$deb
    printf FILECHR qq |</ArraySet> \n|;
    # close /tmp/$deb
    close(FILECHR);
    # creation ou ecrasement du fichier projet/chr/chr$u avec copie de /tmp/$deb (systeme : mv )
    system "mv $file_tmp $chemficchr ";
  }

  #   ajout balise de fermeture dans /projet/array/puce.xml
  #   close /projet/array/puce.xml
  printf FILEARRAY qq |</ArraySet> \n|;
  close(FILEARRAY);