// Help on writing an option file for the jPREdictor.
// 
// General statements:
// - The text after each  ';' (semicolon),  '#' (hashmark) or  '//' (double
//   slash) is treated as a comment and is ignored, including the delimiter
//   itself.  Spaces, tab stops and other white space are ignored as well,
//   but not within names.
// - The case of a parameter word is ignored,  thus 'Name', 'NAME', 'nAmE',
//   'name' would be the same. This does not hold for user given text, e.g.
//   'name=MotifName' is different from 'name=Motifname'.
// 
// Section explanation:
// - This file is divided into sections, each starting with a predefined
//   word in square brackets, e.g. [Motif].
// - All sections can occur more than once, e.g. if seven motifs have to be
//   specified seven [Motif] sections are needed.
// - Known sections are:
//   * [Sequence] - sets global parameters and sequences to use
//   * [Motif] - contains a motif and information about it
//   * [MultiMotif] - combines distantly connected motifs into groups
//   * [MultiMotifList] - short term definition of multi motifs
// - Note,  that the option file is top-down evaluated.  Therefore,  motifs
//   used in the sections [MultiMotif] and [MultiMotifList] must be defined
//   previously.
// 
// Motif definition:
// - Single motifs  (i.e. motifs  corresponding to a  binding site)  can be
//   defined using the section [Motif].
// - The sections [MultiMotif] and [MultiMotifList] are meant for combining
//   single motifs and multi motifs into motifs of greater order.
// - Multi motifs consist of single motifs and of other multi motifs, which
//   themselves should not be used for searching and scoring operations.
// - Between  successive motifs in a  multi motif the  allowed minimal  and
//   maximal distance must be given.  Such distance is always measured from
//   the end of the first to the beginning of the latter motif.
// - The control over  which motifs are used for  searching and scoring can
//   be achieved by using the parameter  'usable'.  Motifs generated in the
//   section [MultiMotifList] are always used for such operations,  whereas
//   motifs from section [Motif] have (per default)  'usable' set to false,
//   and motifs from section [MultiMotif] have 'usable' set to true.
// 
// Parameters:
// - Used to specify the  meaning of the following text,  e.g. after 'Name'
//   follows a name.
// - All parameter names must be followed by '=',  e.g. 'name = MotifTen'.
// - Leading and trailing spaces and tabs are ignored, e.g. 'name=Mot  Ten'
//   and ' name =  Mot  Ten ' are the same: the name is read as 'Mot  Ten'.
// 
// - Parameters recognized in [Sequence]
//   * sequence_filename (optional) - the filename of the  sequence used to
//     search and score. Can be given only once.
//   * positive_training_set_filename (optional) - the filename which gives
//     the sequence(s) representing the positive training set. Can be given
//     only once.
//   * negative_training_set_filename (optional) - the filename which gives
//     the sequence(s) representing the negative training set. Can be given
//     only once.
//   * windowWidth  (optional) - gives the width of a  section (=window) on
//     the sequence in which searching and scoring is done. Default is 500.
//   * windowShift  (optional) - specifies the value by  which this section
//     (=window) is shifted on the sequence. Default is 100.
//   * global_background (optional) - specifies a character distribution to
//     be used for all following motifs (see section Motif), when they have
//     to be recalculated to position specific score matrices. This purpose
//     might be the only reason, why the section Sequence  occurs more than
//     once.  Additionally,  the global background  is used by the built-in
//     cutoff calculator (GUI version) for generation of random sequences.
//   * sequenceLengthNormalization  (optional) - the weight for  any usable
//     motif is calculated from its occurrence on the sequences of both the
//     positive and negative training set. During calculation the number of
//     hits on a specific sequence is divided by that sequence's length, if
//     the parameter was set to 'true'. Default is 'false'.  Note, that the
//     weights  are always  normalized by  the number of sequences  in both
//     training sets.
// 
// - Parameters recognized in section [Motif]
//   * name (obligate) - specifies the motif's name
//   * description (optional) - a short description of the motif
//   * motif (obligate) - the type of the motif, which can be 'SEQUENCE' or
//     'MULTI_SEQUENCE',  where the  next lines  must contain  one or  more
//     short sequences of the same length  which are calculated either to a
//     RegularExpressionMotif (represented by using the degenerated IUB DNA
//     code) or to a position specific probability matrix (PSPM).
//     As an alternative, 'motif=TABLE_PROB' can be given,  where the motif
//     is represented by a positional probability matrix. The probabilities
//     may as well be occurrence counts,  because the matrix is  either way
//     recalculated to fit the PSPM-table constraints (negative numbers are
//     zeroed, every line has to sum-up to one). If 'TABLE_SCORE' is given,
//     the motif must be represented by a table full of pre-calculated log-
//     odd-scores and is then a PSSM (position specific score matrix).
//     Samples:
//     - motif=SEQUENCE		# the result is NNAAYTTNTT
//       AAAAYTTNTT
//       BBAACTTNTT
//     - motif=MULTI_SEQUENCE
//       #pos    -5..-1  1..4  5..9 (leading, core, trailing)
//       GGCAG CCAT TTTCC
//       CGCAG CCAT TTTCC
//       GTCGG CCAT TAAAA  # found in Fritsch, 1999
//       GGAAG CCAT AACGG
//     - motif=TABLE_PROB	# counted numbers of occurrences are given
//       #pos    A       C       G       T
//       -5      0       16      0       0       # C
//       -3      0       10      0       6       # Y
//       4       0       0       0       16      # T
//       5       1       6       2       7       # N
//   * background  (optional) - given  for motifs of type  'MULTI_SEQUENCE'
//     and 'TABLE_PROB', when the probabilities shall be calculated to log-
//     odd-scores  yielding a PSSM motif  (position specific score matrix).
//     The underlying formula for one position and one nucleotide is:
//       ln( ProbabilityInTable / ProbabilityInBackground ).
//     Sample:
//     - background=
//       # A         C        G         T
//       0.283     0.217    0.218     0.282
//   * errorNumberAllowedForMatch  (optional) - given  only for motifs that
//     are of type 'SEQUENCE'. Default is 0.
//   * threshold (optional) - given for motifs of type  'MULTI_SEQUENCE' or
//     'TABLE', respectively. This is either a probability or a score which
//     depends on the chosen representation of the motif  (as PSPM or PSSM,
//     respectively). Default is the reachable maximum (sum or product over
//     the maximum of every position).
//   * searchDirection (optional) - specifies the direction of the sequence
//     strand the motif is searched on.  Valid values are 'plus' or 'minus'
//     or 'both' (default), respectively.  'minus' means, that the motif is
//     searched on the sequence after it was reversed and complemented.
//   * weight (optional) - the motif's weight for  sequence window scoring.
//   * usable (optional) - decides whether the motif is used for searching,
//     weighting and scoring operations (give 'usable=true')  or whether it
//     can only be  used in multi motifs  (default behavior,  equivalent to
//     'usable=false').
// 
// - Parameters recognized in [MultiMotif]
//   * name (optional) - specifies the multi motif's name. If it is not set,
//     it consists of the  single motifs' names with  a colon as delimiter.
//   * description (optional) - a short description of the multi motif.  If
//     it is not specified it consists of the single motifs' names with the
//     distance information in between.
//   * first_motif (obligate) - specifies the  first motif in the  group by
//     name (motif and name must have been previously defined).
//   * next_motif  (optional) - the allowed minimal and maximal distance to
//     the previous motif, followed by a motif name (delimiter is comma). If
//     this parameter occurs more than once, the order is important.
//     Sample:
//     - first_motif=GAGAG
//       next_motif=0,1000,Zeste
//       next_motif=0,500,GAGAG
//   * weight (optional) - the motif's weight for  sequence window scoring.
//   * usable (optional) - decides whether the motif is used for searching,
//     weighting and  scoring operations  (default behavior,  equivalent to
//     'usable=true') or whether  it is only re-used in  other multi motifs
//     (in this case give 'usable=false').
// 
// - Parameters recognized in [MultiMotifList]
//   * distance (obligate) -  defines minimal and maximal distance  between
//     successive motifs. That distance is used in all multi motifs defined
//     in this section. Delimiter is the comma.
//     Sample:
//     - distance=0,220
//   * the 'distance' parameter is followed by one or more lines with  the
//     names of motifs, which have to be combined into multi motifs. If the
//     names in a line are separated by comma, the corresponding motifs are
//     combined into as many DoubleMotifs as possible (self-combination and
//     forward permutation are allowed).  Lines with  names  separated  by
//     colons are directly treated as one multi motif.
//     Sample:
//     - GAGAG:Zeste:PHO	# One MultiMotif of 3 single motifs
//       GAGAG,Zeste,PHO	# this line is the same as the following 6 lines
//       GAGAG:GAGAG
//       GAGAG:Zeste
//       GAGAG:PHO
//       Zeste:Zeste
//       Zeste:PHO
//       PHO:PHO
// 


[Motif]
# Single motif "PHO": four aligned binding sites of 14 nucleotides each
# (5 leading, 4 core and 5 trailing positions), given as MULTI_SEQUENCE.
# NOTE(review): 'usable' is not set here and this name is not referenced in
# the MultiMotifList section of this file (which lists "PHO core"), so with
# the documented default for single motifs it may not take part in
# searching and scoring - confirm that this is intended.
name        = PHO
description = PHO core with leading and trailing sequence parts
motif       = MULTI_SEQUENCE

#pos    -5..-1  1..4  5..9 (leading, core, trailing)
GGCAG CCAT TTTCC  # Brown, 2003
CGCAG CCAT TTTCC  # Kassis, 1989; Dvir engrailed gene, -725..-712
GTCGG CCAT TAAAA  # Fritsch, 1999 (text); PBX region
GGAAG CCAT AACGG  # Fritsch, 1999 (Fig.2)

# Background nucleotide distribution in the column order A C G T.  Giving a
# background turns the motif into log-odd scores (PSSM), so the threshold
# below is a score rather than a probability.
background  = # this is the background from D.mel
0.287 0.213 0.213 0.287
threshold   = 14.2

[Motif]
# Single motif "G10": a 10-nucleotide GAGA repeat given as a SEQUENCE motif.
# One mismatch is tolerated when matching (the documented default is 0).
name        = G10
description = the long GAGA binding site of size 10
motif       = SEQUENCE
GAGAGAGAGA
errorNumberAllowedForMatch = 1  # one mismatch allowed

[Motif]
# Single motif "PHO core": a 5-nucleotide SEQUENCE motif.
# The name contains an inner space; white space within names is kept, so
# the motif has to be referenced elsewhere by exactly this name.
# errorNumberAllowedForMatch is not set, so the documented default of 0
# mismatches applies.
name        = PHO core
motif       = SEQUENCE
GCCAT

[Motif]
# Single motif "Zeste" given as a TABLE_PROB matrix: one row per position,
# the columns A C G T hold occurrence counts.  The rows do not all have the
# same total (16 for the Y positions, 10 otherwise); according to the help
# text at the top of this file the table is recalculated so that every row
# sums to one.
# No background and no threshold are given in this section, so the
# documented default threshold (the reachable maximum) applies.
name        = Zeste
description = Zeste
motif       = TABLE_PROB
#pos    A       C       G       T
-3      0       8       0       8       # Y
-2      0       0       10      0       # G
-1      10      0       0       0       # A
1       0       0       10      0       # G
2       0       8       0       8       # Y
3       0       0       10      0       # G


[MultiMotifList]
# All multi motifs generated in this section allow a distance of 0 to 219
# between successive motifs (measured from the end of the first motif to
# the beginning of the next one).
# The names below are separated by commas, so they are combined into pairs.
# By analogy with the sample in the help text this should yield
#   Zeste:Zeste, Zeste:G10, Zeste:PHO core, G10:G10, G10:PHO core,
#   PHO core:PHO core
# All three motifs are defined in Motif sections earlier in this file, as
# required by the top-down evaluation.
distance=0,219
Zeste,G10, PHO core

[Sequence]
# Input files: the sequence to search and score, plus the positive and the
# negative training set.  windowWidth and windowShift are not set, so the
# documented defaults (500 and 100) apply.
# NOTE(review): the sequence to be scored is the same file as the positive
# training set, and all three entries are absolute paths into one user's
# home directory - confirm and adjust before using this file elsewhere.
sequence_filename=/homes/tfiedler/a/sFreqNuc/new_pre.fasta
positive_training_set_filename=/homes/tfiedler/a/sFreqNuc/new_pre.fasta
negative_training_set_filename=/homes/tfiedler/a/sFreqNuc/non_pre.fasta
