// ValidateLexiconEntities.java from Texai at Krugle
// Show ValidateLexiconEntities.java syntax highlighted
/*
* ValidateLexiconEntities.java
*
* Created on August 29, 2007, 12:26 PM
*
* Description: Validates the lexicon repository contents.
*
* Copyright (C) August 29, 2007 Stephen L. Reed.
*
* This program is free software; you can redistribute it and/or modify it under the terms
* of the GNU General Public License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with this program;
* if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
package org.texai.lexicon.domainEntity;
import java.io.File;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import net.jcip.annotations.GuardedBy;
import net.jcip.annotations.Immutable;
import net.jcip.annotations.NotThreadSafe;
import net.sf.ehcache.CacheManager;
import org.apache.log4j.Logger;
import org.openrdf.OpenRDFException;
import org.openrdf.model.URI;
import org.openrdf.query.MalformedQueryException;
import org.openrdf.query.QueryLanguage;
import org.openrdf.query.TupleQuery;
import org.openrdf.query.TupleQueryResult;
import org.openrdf.repository.Repository;
import org.openrdf.repository.RepositoryConnection;
import org.openrdf.repository.RepositoryException;
import org.openrdf.repository.sail.SailRepository;
import org.openrdf.sail.nativerdf.NativeStore;
import org.texai.grammar.domainEntity.AlternativeConstruction;
import org.texai.kb.CacheInitializer;
import org.texai.kb.Constants;
import org.texai.kb.persistence.RDFEntityManager;
import org.texai.util.TexaiException;
/** Validates the lexicon repository contents.
 *
 * <p>For each configured entity class URI, the URIs of its instances are queried from the Sesame
 * repository, and then a fixed number of worker runnables load and check those entities in
 * parallel, pulling URIs from a shared iterator.
 *
 * @author reed
 */
@NotThreadSafe
public class ValidateLexiconEntities {

  /** the number of validation threads */
  private static final int NBR_THREADS = 2;
  /** the number of validated entities between progress log entries and connected-entity cache resets */
  private static final int PROGRESS_INTERVAL = 2500;
  /** the class URI of TexaiEnglishWord entities */
  private static final String TEXAI_ENGLISH_WORD_CLASS_URI =
          "http://texai.org/texai/org.texai.lexicon.domainEntity.TexaiEnglishWord";
  /** the class URI of TexaiEnglishWordForm entities */
  private static final String TEXAI_ENGLISH_WORD_FORM_CLASS_URI =
          "http://texai.org/texai/org.texai.lexicon.domainEntity.TexaiEnglishWordForm";
  /** the class URI of TexaiEnglishWordSense entities */
  private static final String TEXAI_ENGLISH_WORD_SENSE_CLASS_URI =
          "http://texai.org/texai/org.texai.lexicon.domainEntity.TexaiEnglishWordSense";
  /** the class URI of TexaiSamplePhrase entities */
  private static final String TEXAI_SAMPLE_PHRASE_CLASS_URI =
          "http://texai.org/texai/org.texai.lexicon.domainEntity.TexaiSamplePhrase";
  /** the class URIs for entities to validate, in the order in which they are validated */
  private static final String[] CLASS_URI_STRINGS = {
    TEXAI_ENGLISH_WORD_CLASS_URI,
    TEXAI_ENGLISH_WORD_FORM_CLASS_URI,
    TEXAI_ENGLISH_WORD_SENSE_CLASS_URI,
    TEXAI_SAMPLE_PHRASE_CLASS_URI
  };
  /** the logger */
  private static final Logger LOGGER = Logger.getLogger(ValidateLexiconEntities.class);
  /** the Sesame repository */
  private Repository repository;
  /** the executor */
  private final ExecutorService executor;
  /** the set of URIs to validate for the entity class currently being processed */
  private final Set<URI> urisToValidate = new HashSet<URI>();
  /** the iterator of URIs to validate, shared by the validation runnables */
  @GuardedBy("itself")
  private Iterator<URI> urisToValidate_iter;
  /** the number of validated entities so far */
  @GuardedBy("nbrURIsValidated_lock")
  private volatile int nbrURIsValidated = 0;
  /** the lock for the number of validated entities so far */
  private final Object nbrURIsValidated_lock = new Object();

  /**
   * Creates a new instance of ValidateLexiconEntities, with a fixed thread pool of NBR_THREADS threads.
   */
  public ValidateLexiconEntities() {
    executor = Executors.newFixedThreadPool(NBR_THREADS);
  }

  /** Initializes this application: initializes the caches, turns on default assertion status for this
   * class loader, and opens the Sesame native store repository.
   *
   * @throws TexaiException if the repository cannot be initialized
   */
  public void initialize() {
    // NOTE(review): initializeCaches() is invoked both before and after enabling assertions; the second
    // call looks redundant, but it is kept because its idempotence cannot be confirmed from this file.
    CacheInitializer.initializeCaches();
    getClass().getClassLoader().setDefaultAssertionStatus(true); // optional
    CacheInitializer.initializeCaches();
    // final File dataDirectory = new File(System.getProperties().getProperty("user.home") + "/.aduna/openrdf-sesame/repositories/TexaiEnglishLexicon");
    final File dataDirectory = new File("/mnt/tmpfs/repositories/TexaiEnglishLexicon");
    LOGGER.info("accessing the Sesame2 repository in " + dataDirectory.toString());
    final String indices = "spoc,posc";
    repository = new SailRepository(new NativeStore(dataDirectory, indices));
    try {
      repository.initialize();
    } catch (final RepositoryException ex) {
      throw new TexaiException(ex);
    }
  }

  /** Validates the lexicon RDF entities of every configured entity class, one class at a time. */
  public void validate() {
    for (final String classURIString : CLASS_URI_STRINGS) {
      validateRDFEntities(classURIString);
    }
  }

  /** Validates the RDF entities having the given class URI.
   *
   * <p>The entity URIs are gathered first, then NBR_THREADS validation runnables are submitted to the
   * executor and this method blocks until all of them have signalled completion.
   *
   * @param classURIString the class URI string
   * @throws TexaiException if the query fails or if the calling thread is interrupted while waiting
   */
  private void validateRDFEntities(final String classURIString) {
    LOGGER.info("");
    LOGGER.info("querying the entity URIs having class URI " + classURIString);
    queryEntityURIs(classURIString);
    if (urisToValidate.isEmpty()) {
      LOGGER.info("*** no entity URIs selected " + classURIString);
      return;
    }

    nbrURIsValidated = 0;
    final int urisToValidate_size = urisToValidate.size();
    LOGGER.info("found " + urisToValidate_size + " entity URIs");
    urisToValidate_iter = urisToValidate.iterator();
    final long startMillis = System.currentTimeMillis();
    final CountDownLatch doneSignal = new CountDownLatch(NBR_THREADS);
    for (int i = 0; i < NBR_THREADS; i++) {
      executor.execute(new EntityValidationRunnable(classURIString, doneSignal, i + 1));
    }
    try {
      doneSignal.await();
    } catch (InterruptedException ex) {
      // restore the interrupt status so that callers further up the stack can observe it
      Thread.currentThread().interrupt();
      throw new TexaiException(ex);
    }
    double secondsDuration = (System.currentTimeMillis() - startMillis) / 1000.0d;
    if (secondsDuration == 0) {
      // avoid dividing by zero when the validation completes within the clock resolution
      secondsDuration = 1;
    }
    LOGGER.info("validated " + urisToValidate_size + " at the rate of " + urisToValidate_size / secondsDuration + " per second");
  }

  /** Replaces the contents of the set of URIs to validate with the subjects whose rdf:type is the given class URI.
   *
   * <p>The query result and the repository connection are closed even when the query or the iteration fails.
   *
   * @param classURIString the class URI string
   * @throws TexaiException if the query cannot be prepared or evaluated, or the repository cannot be accessed
   */
  private void queryEntityURIs(final String classURIString) {
    urisToValidate.clear();
    try {
      final RepositoryConnection repositoryConnection = repository.getConnection();
      try {
        final String queryString =
                "SELECT s FROM {s} rdf:type {<" + classURIString + ">}";
        LOGGER.info("query: " + queryString);
        final TupleQuery tupleQuery = repositoryConnection.prepareTupleQuery(QueryLanguage.SERQL, queryString);
        final TupleQueryResult tupleQueryResult = tupleQuery.evaluate();
        try {
          while (tupleQueryResult.hasNext()) {
            urisToValidate.add((URI) tupleQueryResult.next().getBinding("s").getValue());
          }
        } finally {
          tupleQueryResult.close();
        }
      } finally {
        if (LOGGER.isDebugEnabled()) {
          LOGGER.debug("closing the query repository connection");
        }
        repositoryConnection.close();
      }
    } catch (final MalformedQueryException ex) {
      throw new TexaiException(ex);
    } catch (final RepositoryException ex) {
      throw new TexaiException(ex);
    } catch (final OpenRDFException ex) {
      throw new TexaiException(ex);
    }
  }

  /** Counts one more validated entity, and resets the connected RDF entities cache each time the count
   * reaches a multiple of PROGRESS_INTERVAL.
   *
   * @return whether the entity just counted should be logged as a progress indication
   */
  private boolean countValidatedEntity() {
    synchronized (nbrURIsValidated_lock) {
      nbrURIsValidated++;
      if (nbrURIsValidated % PROGRESS_INTERVAL == 0) {
        CacheInitializer.resetCache(Constants.CACHE_CONNECTED_RDF_ENTITIES);
        return true;
      }
      return false;
    }
  }

  /** A parallel runnable that validates entity URIs which it obtains from the shared iterator. */
  @Immutable
  class EntityValidationRunnable implements Runnable {

    /** the entity class URI string */
    private final String classURIString;
    /** the count down latch that synchronizes the calling thread */
    private final CountDownLatch doneSignal;
    /** the thread id */
    private final int id;

    /** Constructs a new EntityValidationRunnable instance.
     *
     * @param classURIString the entity class URI string
     * @param doneSignal the count down latch that synchronizes the calling thread
     * @param id the identification for this runnable
     */
    public EntityValidationRunnable(final String classURIString, final CountDownLatch doneSignal, final int id) {
      //Preconditions
      assert doneSignal != null : "doneSignal must not be null";
      assert classURIString != null : "classURIString must not be null";
      assert !classURIString.isEmpty() : "classURIString must not be empty";

      this.classURIString = classURIString;
      this.doneSignal = doneSignal;
      this.id = id;
    }

    /** Validates entity URIs taken from the shared iterator until none remain.
     *
     * <p>Any exception ends this runnable after being logged. The latch is counted down in every
     * case, so that the thread awaiting it cannot block forever because a worker failed.
     */
    public void run() {
      RDFEntityManager rdfEntityManager = null;
      try {
        LOGGER.info("starting " + id);
        Thread.currentThread().setName("validator " + id);
        rdfEntityManager = new RDFEntityManager(repository);
        int nbrURIsProcessed = 0;
        URI uri = getNextEntityURIToValidate();
        while (uri != null) {
          final boolean isLogged = countValidatedEntity();
          validateEntity(rdfEntityManager, uri, isLogged);
          nbrURIsProcessed++;
          uri = getNextEntityURIToValidate();
        }
        LOGGER.info("Thread " + id + " completed " + nbrURIsProcessed + " entity URIs");
      } catch (final Exception ex) {
        LOGGER.error(ex.getMessage(), ex);
      } finally {
        try {
          if (rdfEntityManager != null) {
            rdfEntityManager.close();
          }
        } finally {
          // always release the waiting thread, even when validation or closing failed
          doneSignal.countDown();
        }
      }
    }

    /** Dispatches the given entity URI to the validation method that matches this runnable's class URI.
     * A class URI that matches none of the known entity classes is silently skipped.
     *
     * @param rdfEntityManager the RDF entity manager
     * @param uri the RDF entity URI
     * @param isLogged the indicator whether to log this entity
     */
    private void validateEntity(final RDFEntityManager rdfEntityManager, final URI uri, final boolean isLogged) {
      if (TEXAI_ENGLISH_WORD_CLASS_URI.equals(classURIString)) {
        validateTexaiEnglishWord(rdfEntityManager, uri, isLogged);
      } else if (TEXAI_ENGLISH_WORD_FORM_CLASS_URI.equals(classURIString)) {
        validateTexaiEnglishWordForm(rdfEntityManager, uri, isLogged);
      } else if (TEXAI_ENGLISH_WORD_SENSE_CLASS_URI.equals(classURIString)) {
        validateTexaiEnglishWordSense(rdfEntityManager, uri, isLogged);
      } else if (TEXAI_SAMPLE_PHRASE_CLASS_URI.equals(classURIString)) {
        validateTexaiSamplePhrase(rdfEntityManager, uri, isLogged);
      }
    }
  }

  /** Validates the TexaiEnglishWord entity, currently by checking that its id equals the URI used to load it.
   *
   * @param rdfEntityManager the RDF entity manager
   * @param uri the RDF entity URI
   * @param isLogged the indicator whether to log this entity
   */
  private void validateTexaiEnglishWord(final RDFEntityManager rdfEntityManager, final URI uri, final boolean isLogged) {
    //Preconditions
    assert rdfEntityManager != null : "rdfEntityManger must not be null";
    assert uri != null : "uri must not be null";

    final TexaiEnglishWord texaiEnglishWord = (TexaiEnglishWord) rdfEntityManager.find(TexaiEnglishWord.class, uri);
    if (texaiEnglishWord == null) {
      // defensive: presumably find yields null when the entity cannot be loaded - report it rather than fail
      LOGGER.warn("no TexaiEnglishWord entity could be loaded for " + uri);
      return;
    }
    if (isLogged) {
      LOGGER.info(texaiEnglishWord.getLemma() + " by thread " + Thread.currentThread().getName());
    }
    // the id
    if (!texaiEnglishWord.getId().equals(uri)) {
      LOGGER.warn(texaiEnglishWord + " ids are not equal");
    } else if (LOGGER.isDebugEnabled()) {
      LOGGER.debug("id OK for " + texaiEnglishWord.getId());
    }
    //TODO
  }

  /** Validates the TexaiEnglishWordForm entity, currently by checking that its id equals the URI used to load it.
   *
   * @param rdfEntityManager the RDF entity manager
   * @param uri the RDF entity URI
   * @param isLogged the indicator whether to log this entity
   */
  private void validateTexaiEnglishWordForm(final RDFEntityManager rdfEntityManager, final URI uri, final boolean isLogged) {
    //Preconditions
    assert rdfEntityManager != null : "rdfEntityManger must not be null";
    assert uri != null : "uri must not be null";

    final TexaiEnglishWordForm texaiEnglishWordForm = (TexaiEnglishWordForm) rdfEntityManager.find(TexaiEnglishWordForm.class, uri);
    if (texaiEnglishWordForm == null) {
      // defensive: presumably find yields null when the entity cannot be loaded - report it rather than fail
      LOGGER.warn("no TexaiEnglishWordForm entity could be loaded for " + uri);
      return;
    }
    if (isLogged) {
      LOGGER.info(texaiEnglishWordForm.getWordForm() + " by thread " + Thread.currentThread().getName());
    }
    // the id
    if (!texaiEnglishWordForm.getId().equals(uri)) {
      LOGGER.warn(texaiEnglishWordForm + " ids are not equal");
    } else if (LOGGER.isDebugEnabled()) {
      LOGGER.debug("id OK for " + texaiEnglishWordForm.getId());
    }
    //TODO
  }

  /** Validates the TexaiEnglishWordSense entity, currently by checking that its id equals the URI used to load it.
   *
   * @param rdfEntityManager the RDF entity manager
   * @param uri the RDF entity URI
   * @param isLogged the indicator whether to log this entity
   */
  private void validateTexaiEnglishWordSense(final RDFEntityManager rdfEntityManager, final URI uri, final boolean isLogged) {
    //Preconditions
    assert rdfEntityManager != null : "rdfEntityManger must not be null";
    assert uri != null : "uri must not be null";

    final TexaiEnglishWordSense texaiEnglishWordSense = (TexaiEnglishWordSense) rdfEntityManager.find(TexaiEnglishWordSense.class, uri);
    if (texaiEnglishWordSense == null) {
      // defensive: presumably find yields null when the entity cannot be loaded - report it rather than fail
      LOGGER.warn("no TexaiEnglishWordSense entity could be loaded for " + uri);
      return;
    }
    if (isLogged) {
      LOGGER.info(texaiEnglishWordSense.toString() + " by thread " + Thread.currentThread().getName());
    }
    // the id
    if (!texaiEnglishWordSense.getId().equals(uri)) {
      LOGGER.warn(texaiEnglishWordSense + " ids are not equal");
    } else if (LOGGER.isDebugEnabled()) {
      LOGGER.debug("id OK for " + texaiEnglishWordSense.getId());
    }
    //TODO
  }

  /** Validates the TexaiSamplePhrase entity, currently by checking that its id equals the URI used to load it.
   *
   * @param rdfEntityManager the RDF entity manager
   * @param uri the RDF entity URI
   * @param isLogged the indicator whether to log this entity
   */
  private void validateTexaiSamplePhrase(final RDFEntityManager rdfEntityManager, final URI uri, final boolean isLogged) {
    //Preconditions
    assert rdfEntityManager != null : "rdfEntityManger must not be null";
    assert uri != null : "uri must not be null";

    final TexaiSamplePhrase texaiSamplePhrase = (TexaiSamplePhrase) rdfEntityManager.find(TexaiSamplePhrase.class, uri);
    if (texaiSamplePhrase == null) {
      // defensive: presumably find yields null when the entity cannot be loaded - report it rather than fail
      LOGGER.warn("no TexaiSamplePhrase entity could be loaded for " + uri);
      return;
    }
    if (isLogged) {
      LOGGER.info(texaiSamplePhrase.toString() + " by thread " + Thread.currentThread().getName());
    }
    // the id
    if (!texaiSamplePhrase.getId().equals(uri)) {
      LOGGER.warn(texaiSamplePhrase + " ids are not equal");
    } else if (LOGGER.isDebugEnabled()) {
      LOGGER.debug("id OK for " + texaiSamplePhrase.getId());
    }
    //TODO
  }

  /** Gets the next entity URI to validate, synchronizing on the shared iterator.
   *
   * @return the next entity URI to validate, or null when done
   */
  private URI getNextEntityURIToValidate() {
    synchronized (urisToValidate_iter) {
      return urisToValidate_iter.hasNext() ? urisToValidate_iter.next() : null;
    }
  }

  /** Finalizes this application by shutting down the executor, the cache manager and the repository.
   *
   * @throws TexaiException if the repository cannot be shut down
   */
  public void finalization() {
    executor.shutdown();
    CacheManager.getInstance().shutdown();
    try {
      repository.shutDown();
    } catch (final RepositoryException ex) {
      throw new TexaiException(ex);
    }
    LOGGER.info("ValidateLexiconEntities completed");
  }

  /** Executes this application.
   *
   * @param args the command line arguments (unused)
   */
  public static void main(final String[] args) {
    final ValidateLexiconEntities validateLexiconEntities = new ValidateLexiconEntities();
    validateLexiconEntities.initialize();
    try {
      validateLexiconEntities.validate();
    } finally {
      // shut down the thread pool and the repository even when validation fails
      validateLexiconEntities.finalization();
    }
    System.exit(0);
  }
}
// See more files for this project here