/**********************************************************************
obgrep - Open Babel molecule grep using SMARTS.
Copyright (C) 2003 Fabien Fontaine
Some portions Copyright (C) 2004-2005 Geoffrey R. Hutchison
This file is part of the Open Babel project.
For more information, see <http://openbabel.sourceforge.net/>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
***********************************************************************/
// used to set import/export for Cygwin DLLs
#ifdef WIN32
#define USING_OBDLL
#endif
#include <openbabel/babelconfig.h>
#include <openbabel/mol.h>
#include <openbabel/obconversion.h>
#include <openbabel/parsmart.h>
#ifdef _WIN32
typedef char TCHAR;
#include "XGetOpt.h"
#else
#include <unistd.h>
#endif
using namespace std;
using namespace OpenBabel;
///////////////////////////////////////////////////////////////////////////////
//! \brief Find the molecule(s) with or without a given SMART pattern
int main(int argc,char **argv)
{
char c;
unsigned int ntimes=0; // number of times SMARTS matches in a molecule
unsigned int numMatching = 0; // number of matching molecules (for -c flag)
bool pattern_matched=false, ntimes_matched=true;
bool count=false, invert=false, full=false, name_only=false;
char *FileIn = NULL, *Pattern = NULL;
char *program_name = argv[0];
char *iext;
bool useInFile = true;
OBConversion conv(&cin,&cout);
OBFormat *pFormat = conv.FindFormat("smi"); // default format is SMILES
// Parse options
while ((c = getopt(argc, argv, "t:nvcfi:-")) != -1)
{
#ifdef _WIN32
char optopt = c;
#endif
switch (c)
{
case 't': // request ntimes unique matches
c = sscanf(optarg, "%d", &ntimes);
if (c != 1 )
{
cerr << program_name << ": unable to parse -t option" << endl;
exit (-1);
}
break;
case 'i':
iext = optarg;
//The ID provided by the OBFormat class is used as
// the identifying file extension. This is a slight
// reduction in flexibility (which is not currently used)
pFormat = conv.FindFormat(iext);
if(pFormat==NULL)
{
cerr << program_name << ": cannot read input format!" << endl;
exit(-1);
}
break;
case 'n': // print the molecule name only
name_only = true;
break;
case 'c': // count the number of match
count = true;
break;
case 'v': // match only the molecules without the pattern
invert = true;
break;
case 'f':
full = true;
break;
case '-':
useInFile = false;
break;
case '?':
if (isprint (optopt))
fprintf (stderr, "Unknown option `-%c'.\n", optopt);
else
fprintf (stderr,
"Unknown option character `\\x%x'.\n",
optopt);
return 1;
}
}
int index = optind;
if (argc-index != 2 && argc-index != 1)
{
string err = "Usage: ";
err += program_name;
err += " [options] \"PATTERN\" <filename>\n";
err += "If no filename is supplied, then obgrep will use stdin instead.\n";
err += "Options:\n";
err += " -v Invert the matching, print non-matching molecules\n";
err += " -c Print the number of matched molecules\n";
err += " -i <format> Specify the input and output format\n";
err += " -f Full match, print matching-molecules when the number\n";
err += " of heavy atoms is equal to the number of PATTERN atoms\n";
err += " -n Only print the name of the molecules\n";
err += " -t NUM Print a molecule only if the PATTERN occurs NUM times inside the molecule\n";
cerr << err << ends;
exit(-1);
}
else
{
Pattern = argv[index++];
if (argc - index == 1)
FileIn = argv[index];
}
ifstream ifs;
if (useInFile && FileIn != NULL)
{
// Read the file
ifs.open(FileIn);
if (!ifs)
{
cerr << program_name << ": cannot read input file!" << endl;
exit (-1);
}
conv.SetInStream(&ifs);
// Find Input filetype
pFormat = conv.FormatFromExt(FileIn);
if (pFormat == NULL)
{
cerr << program_name << ": cannot read input format!" << endl;
return (-1);
}
}
if (! conv.SetInAndOutFormats(pFormat, pFormat))
{
cerr << program_name << ": cannot read or write to this file format" << endl;
return (-1);
}
// Match the SMART
OBSmartsPattern sp;
vector< vector <int> > maplist; // list of matched atoms
sp.Init(Pattern);
OBMol mol;
bool impossible_match;
// Search for pattern
for (c=0;;)
{
mol.Clear();
conv.Read(&mol);
if (mol.Empty())
break;
////////////////////////////////////////////////////////////////
// Do not loose time trying to match the pattern if the matching
// is impossible.
// It is impossible to make a full match if the number of atoms is
// different
if (full )
impossible_match = (sp.NumAtoms() == mol.NumHvyAtoms()) ? false : true;
else
impossible_match = false;
if (impossible_match)
{ // -> avoid useless SMART matching attempt
if (invert)
{
if (!count)
{
if ( name_only )
cout << mol.GetTitle() << endl;
else
conv.Write(&mol, &cout);
}
numMatching++;
}
continue;
}
////////////////////////////////////////////////////////////////
// perform SMART matching
pattern_matched = sp.Match(mol);
// the number of times the match occured may matter
if ( ntimes )
{ // ntimes is a positive integer of requested matches
// Here, a match mean a unique match (same set of atoms)
// so we need to get the unique match list size
maplist = sp.GetUMapList();
if( maplist.size() == ntimes )
ntimes_matched = true;
else
ntimes_matched = false;
}
else
{ // ntimes == 0, we don't care about the number of matches
ntimes_matched = true;
}
////////////////////////////////////////////////////////////////
// perform a set of tests to guess what to print out
if ( pattern_matched == true && ntimes_matched == true)
{
if (!invert)
{ // do something only when invert flag is off
if (!count)
{
if ( name_only )
cout << mol.GetTitle() << endl;
else
conv.Write(&mol, &cout);
}
numMatching++;
}
}
else
{ // The SMART pattern do not occur as many times as requested
if (invert)
{ // do something only if invert flag is on
if (!count)
{
if ( name_only )
cout << mol.GetTitle() << endl;
else
conv.Write(&mol, &cout);
}
numMatching++;
}
}
} // end for loop
////////////////////////////////////////////////////////////////
// Only print the number of matched molecules as requested
if (count)
{
cout << numMatching << endl;
}
return(1);
}
/* obgrep man page*/
/** \page obgrep an advanced SMARTS grep program
*
* \n
* \par SYNOPSIS
*
* \b obgrep [options] '<SMARTS-pattern>' \<filename\>
*
* \par DESCRIPTION
*
* The obgrep tool can be used to search for molecules inside multi-molecule
* database files (e.g., SMILES, SDF, etc.).
*
* \par OPTIONS
*
* If only a filename is given, obgrep will attempt to guess
* the file type from the filename extension. \n\n
*
* \b -c:
* Print the number of matches \n\n
* \b -f:
* Full match, print matching-molecules only when the number
* of heavy atoms is also equal to the number of atoms in the
* SMARTS pattern \n\n
* \b -i \<format\>:
* Specifies input and output format, see "babel" for available formats \n\n
* \b -n:
* Only print the name of the molecules\n\n
* \b -t \<NUM\>:
* Print a molecule only if the pattern occurs NUM times inside the molecule\n\n
* \b -v:
* Invert the matching, print non-matching molecules \n\n
*
* \par EXAMPLES
* - Print all the molecules with a methylamine group: \n
* obgrep "CN" database.smi
* - Print all the molecules without a methylamine group: \n
* obgrep -v "CN" database.smi
* - Print the number of molecules without a methylamine group: \n
* obgrep -v -c "CN" database.smi
* - Print methylamine (if it exists in the file): \n
* obgrep -f "CN" database.smi
* - Print methylamine and/or methanol (if they exist): \n
* obgrep -f "C[N,O]" database.smi
*
* \par AUTHORS
*
* The obgrep program was contributed by \b Fabien \b Fontaine.
*
* Open Babel is currently maintained by \b Geoff \b Hutchison, \b Chris \b Morley and \b Michael \b Banck.
*
* For more contributors to Open Babel, see http://openbabel.sourceforge.net/THANKS.shtml
*
* \par COPYRIGHT
* Copyright (C) 1998-2001 by OpenEye Scientific Software, Inc.
* Some portions Copyright (C) 2001-2005 by Geoffrey R. Hutchison \n \n
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation version 2 of the License.\n \n
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* \par SEE ALSO
* The web pages for Open Babel can be found at: http://openbabel.sourceforge.net/ \n
* A guide for constructing SMARTS patterns can be found at: http://www.daylight.com/dayhtml/doc/theory/theory.smarts.html
**/
syntax highlighted by Code2HTML, v. 0.9.1