Pages

Tuesday, July 30, 2013

CTR #4: Working with SD tag data

Next in the Chemistry Toolkit Rosetta (CTR) CDK/Groovy solutions series is how to work with SD tag data. This CTR problem asks us to read the contents of several SD fields for each record, process them, and summarize the result in a new field. Here's the CDK/Groovy solution, which uses Groovy's @Grab (Grape) functionality to fetch the CDK libraries:

// Grape dependency configuration: register an extra repository (named 'idea')
// and fetch the two CDK 1.4.11 modules whose packages are imported below
// (cdk-io and cdk-silent).
@GrabResolver(
  name='idea',
  root='http://ambit.uni-plovdiv.bg:8083/nexus/content/repositories/thirdparty/'
)
@Grapes([
  // Supplies the org.openscience.cdk.io.* reader/writer classes.
  @Grab(
    group='org.openscience.cdk',
    module='cdk-io',
    version='1.4.11'
  ),
  // Supplies the org.openscience.cdk.silent.* classes.
  @Grab(
    group='org.openscience.cdk',
    module='cdk-silent',
    version='1.4.11' 
  )
])
 
import org.openscience.cdk.io.*;
import org.openscience.cdk.io.iterator.*;
import org.openscience.cdk.silent.*;
import java.util.zip.GZIPInputStream;
 
// Reads every record from the gzipped SD file "benzodiazepine.sdf.gz", adds a
// RULE5 tag to it and writes the record to "RULES5.sdf".
//
// RULE5 is:
//   "no logP" - the record has no PUBCHEM_XLOGP3 tag
//   "1"       - at least 3 of the 4 threshold checks below pass
//   "0"       - otherwise
//
// Every record is written to the output, including the "no logP" ones
// (previously those were tagged but silently dropped from the output file).
// Reader and writer are closed in finally blocks so they are released even
// when a tag value fails to parse.
iterator = new IteratingMDLReader(
  new GZIPInputStream(
    new File("benzodiazepine.sdf.gz")
      .newInputStream()
  ),
  SilentChemObjectBuilder.getInstance()
)
try {
  writer = new SDFWriter(
    new FileWriter("RULES5.sdf")
  )
  try {
    while (iterator.hasNext()) {
      mol = iterator.next()
      if (mol.getProperty("PUBCHEM_XLOGP3") == null) {
        // Without a logP value the rule count cannot be computed.
        mol.setProperty("RULE5", "no logP")
      } else {
        // Count how many of the four thresholds the record satisfies.
        ruleCount = 0
        if (Integer.valueOf(mol.getProperty("PUBCHEM_CACTVS_HBOND_ACCEPTOR")) <= 10) {
          ruleCount++
        }
        if (Integer.valueOf(mol.getProperty("PUBCHEM_CACTVS_HBOND_DONOR")) <= 5) {
          ruleCount++
        }
        if (Double.valueOf(mol.getProperty("PUBCHEM_MOLECULAR_WEIGHT")) <= 500.0) {
          ruleCount++
        }
        if (Double.valueOf(mol.getProperty("PUBCHEM_XLOGP3")) <= 5.0) {
          ruleCount++
        }
        mol.setProperty("RULE5", ruleCount >= 3 ? "1" : "0")
      }
      // Write the record regardless of which RULE5 value it received.
      writer.write(mol)
    }
  } finally {
    writer.close()
  }
} finally {
  iterator.close()
}