root/trunk/components/server/src/ome/services/fulltext/FileParser.java
| Revision 2786, 8.3 kB (checked in by jmoore, 4 months ago) |
|---|
| Line | |
|---|---|
| 1 | /* |
| 2 | * $Id$ |
| 3 | * |
| 4 | * Copyright 2008 Glencoe Software, Inc. All rights reserved. |
| 5 | * Use is subject to license terms supplied in LICENSE.txt |
| 6 | */ |
| 7 | |
| 8 | package ome.services.fulltext; |
| 9 | |
| 10 | import java.io.BufferedReader; |
| 11 | import java.io.File; |
| 12 | import java.io.FileReader; |
| 13 | import java.io.Reader; |
| 14 | import java.util.Iterator; |
| 15 | import java.util.NoSuchElementException; |
| 16 | |
| 17 | import ome.services.messages.RegisterServiceCleanupMessage; |
| 18 | import ome.system.OmeroContext; |
| 19 | |
| 20 | import org.apache.commons.logging.Log; |
| 21 | import org.apache.commons.logging.LogFactory; |
| 22 | import org.springframework.beans.BeansException; |
| 23 | import org.springframework.context.ApplicationContext; |
| 24 | import org.springframework.context.ApplicationContextAware; |
| 25 | |
| 26 | /** |
| 27 | * Object which attempts to parse any file given to it. On an exception or |
| 28 | * empty/missing file, an empty {@link Iterable} of {@link Reader} should be |
| 29 | * returned rather than throwing an exception. |
| 30 | * |
| 31 | * Subclasses should follow the same contract. |
| 32 | * |
| 33 | * @author Josh Moore, josh at glencoesoftware.com |
| 34 | * @since 3.0-Beta3 |
| 35 | */ |
| 36 | public class FileParser implements ApplicationContextAware { |
| 37 | |
| 38 | /** Logger for this class. */ |
| 39 | private static final Log log = LogFactory.getLog(FileParser.class); |
| 40 | /** Context used to publish cleanup messages; assigned in {@link #setApplicationContext}. */ |
| 41 | protected OmeroContext context; |
| 42 | /** Stores the supplied context, which must be an {@link OmeroContext} for the cast to succeed. */ |
| 43 | public void setApplicationContext(ApplicationContext arg0) throws BeansException { |
| 44 | this.context = (OmeroContext) arg0; |
| 45 | } |
| 46 | |
| 47 | /** |
| 48 | * Shared {@link Iterable} whose {@link Iterator} never yields an element. It |
| 49 | * stands in for a real result whenever there is nothing to iterate over. |
| 50 | */ |
| 51 | public static final Iterable<Reader> EMPTY = new Iterable<Reader>() { |
| 52 | public Iterator<Reader> iterator() { |
| 53 | // Every call builds a new iterator object; it carries no state. |
| 54 | final Iterator<Reader> nothing = new Iterator<Reader>() { |
| 55 | public void remove() { |
| 56 | throw new UnsupportedOperationException(); |
| 57 | } |
| 58 | public Reader next() { |
| 59 | throw new NoSuchElementException(); |
| 60 | } |
| 61 | public boolean hasNext() { |
| 62 | return false; |
| 63 | } |
| 64 | }; |
| 65 | return nothing; |
| 66 | } |
| 67 | }; |
| 68 | |
| 69 | /** |
| 70 | * Safe entry point for consumers. Delegates to {@link #doParse(File)} and |
| 71 | * converts every failure into the {@link #EMPTY} {@link Iterable}. |
| 72 | * |
| 73 | * {@link #EMPTY} is returned if the {@link File} argument is null, if |
| 74 | * {@link #doParse(File)} returns null, or if it throws an |
| 75 | * {@link Exception}; the exception is logged and not propagated. Only the |
| 76 | * null check happens here, so any other problem with the file (for example |
| 77 | * an unreadable one) has to surface as an exception from |
| 78 | * {@link #doParse(File)} in order to be handled. |
| 79 | * |
| 80 | * The {@link Iterator} returned from the instance should always be |
| 81 | * completely iterated through so that resources can be released. For |
| 82 | * example, <code> |
| 83 | * for (Reader reader : parse(file)) { |
| 84 | * // possibly ignore reader |
| 85 | * } |
| 86 | * </code> |
| 87 | * |
| 88 | * @param file Can be null. |
| 89 | * @return An {@link Iterable} which is never null. |
| 90 | */ |
| 91 | final public Iterable<Reader> parse(File file) { |
| 92 | if (file == null) { |
| 93 | log.warn("Argument null. Returning EMPTY:"); |
| 94 | return EMPTY; |
| 95 | } |
| 96 | try { |
| 97 | final Iterable<Reader> parsed = doParse(file); |
| 98 | if (parsed != null) { |
| 99 | return parsed; |
| 100 | } |
| 101 | log.debug("Implementation returned null."); |
| 102 | } catch (Exception e) { |
| 103 | log.warn("Implementation threw an exception.", e); |
| 104 | } |
| 105 | // Reached after either a null result or an exception. |
| 106 | return EMPTY; |
| 107 | } |
| 108 | |
| 109 | /** |
| 110 | * Template method to parse a {@link File} into manageable chunks. |
| 111 | * |
| 112 | * The default implementation reads from the file lazily with chunks |
| 113 | * overlapping on the final white space. For example a file with: |
| 114 | * <code>The quick brown fox jumps over the lazy dog</code> might be |
| 115 | * parsed to: <code>The quick brown fox jumps</code> and |
| 116 | * <code>jumps over the lazy dog</code>. |
| 117 | * |
| 118 | * Receives a non-null, {@link File#canRead() readable} {@link File} |
| 119 | * instance from {@link #parse(File)} and can return a possible null |
| 120 | * {@link Iterable} or throw an {@link Exception}. |
| 121 | * |
| 122 | * In any of the non-successful cases, the {@link #EMPTY} {@link Iterable} |
| 123 | * will be returned to the consumer. |
| 124 | */ |
| 125 | public Iterable<Reader> doParse(File file) throws Exception { |
| 126 | FileReader reader = new FileReader(file); |
| 127 | BufferedReader buffered = new BufferedReader(reader); |
| 128 | context.publishEvent(new RegisterServiceCleanupMessage(this, buffered) { |
| 129 | @Override |
| 130 | public void close() { |
| 131 | try { |
| 132 | Reader r = (Reader) resource; |
| 133 | r.close(); |
| 134 | } catch (Exception e) { |
| 135 | log.debug("Error closing " + resource, e); |
| 136 | } |
| 137 | } |
| 138 | }); |
| 139 | Iterator<Reader> it = new SingleIterator(buffered); |
| 140 | return wrap(it); |
| 141 | } |
| 142 | |
| 143 | /** |
| 144 | * Adapts an {@link Iterator} to the {@link Iterable} interface so that it |
| 145 | * can be returned from {@link #doParse(File)}. A null argument yields |
| 146 | * {@link #EMPTY}. |
| 147 | * |
| 148 | * The result hands out this very {@link Iterator} on every call, so a |
| 149 | * second traversal continues where the first one stopped. |
| 150 | * |
| 151 | * @param it |
| 152 | * Can be null. |
| 153 | * @return Will never be null |
| 154 | */ |
| 155 | public Iterable<Reader> wrap(Iterator<Reader> it) { |
| 156 | return it == null ? EMPTY : new IteratorWrapper(it); |
| 157 | } |
| 158 | |
| 159 | /** |
| 160 | * Wraps a single {@link Reader}; a null argument yields {@link #EMPTY}. |
| 161 | */ |
| 162 | public Iterable<Reader> wrap(Reader r) { |
| 163 | return r == null ? EMPTY : wrap(new SingleIterator(r)); |
| 164 | } |
| 165 | |
| 166 | /** {@link Iterator} over exactly one {@link Reader}, which it hands out a single time. */ |
| 167 | private static class SingleIterator implements Iterator<Reader> { |
| 168 | /** The reader still to be handed out; null once {@link #next()} has returned it. */ |
| 169 | Reader r; |
| 170 | |
| 171 | SingleIterator(Reader r) { |
| 172 | this.r = r; |
| 173 | } |
| 174 | public boolean hasNext() { |
| 175 | return r != null; |
| 176 | } |
| 177 | public Reader next() { |
| 178 | if (r == null) { |
| 179 | throw new NoSuchElementException(); // Iterator contract: never return null past the end |
| 180 | } |
| 181 | Reader rv = r; |
| 182 | r = null; |
| 183 | return rv; |
| 184 | } |
| 185 | public void remove() { |
| 186 | throw new UnsupportedOperationException(); |
| 187 | } |
| 188 | } |
| 189 | |
| 190 | /** {@link Iterable} view of one pre-built {@link Iterator}; every call returns that same instance. */ |
| 191 | private static class IteratorWrapper implements Iterable<Reader> { |
| 192 | private final Iterator<Reader> delegate; |
| 193 | |
| 194 | public IteratorWrapper(Iterator<Reader> it) { |
| 195 | this.delegate = it; |
| 196 | } |
| 197 | |
| 198 | public Iterator<Reader> iterator() { |
| 199 | return delegate; |
| 200 | } |
| 201 | } |
| 202 | |
| 203 | /** |
| 204 | * Iterator yielding the complete text of a file as a single String. Despite the name, no chunking |
| 205 | * happens: the first read drains the file and closes the reader. Unused within this file. |
| 206 | */ |
| 207 | private static class OverlappingChunkFileIterator implements |
| 208 | Iterator<String> { |
| 209 | /** Capacity in chars of both the read buffer and the BufferedReader. */ |
| 210 | private static final int size = 10000; |
| 211 | /** Length of the file as reported by File.length(); used to presize the result. */ |
| 212 | private final long fileSize; |
| 213 | /** Scratch buffer refilled by every read call. */ |
| 214 | private final char[] buf; |
| 215 | /** Element fetched by hasNext() and not yet returned by next(). */ |
| 216 | private String next; |
| 217 | /** Closed and set to null once the file has been read or reading has failed. */ |
| 218 | private BufferedReader reader; |
| 219 | |
| 220 | /** Records the file size and opens the file; throws if it is longer than Integer.MAX_VALUE or cannot be opened. */ |
| 221 | public OverlappingChunkFileIterator(File file) throws Exception { |
| 222 | this.fileSize = file.length(); |
| 223 | if (fileSize > Integer.MAX_VALUE) { |
| 224 | throw new RuntimeException(String.format( |
| 225 | "%s file is too large for current implementation: %s", |
| 226 | file, fileSize)); |
| 227 | } |
| 228 | this.reader = new BufferedReader(new FileReader(file), size); |
| 229 | this.buf = new char[size]; |
| 230 | } |
| 231 | |
| 232 | /** Loads the next element if none is pending and reports whether one is available. */ |
| 233 | public boolean hasNext() { |
| 234 | if (next == null) { |
| 235 | next = doRead(); |
| 236 | } |
| 237 | return next != null; |
| 238 | } |
| 239 | |
| 240 | /** Returns the pending element, or throws NoSuchElementException once the file is used up. */ |
| 241 | public String next() { |
| 242 | if (!hasNext()) { // does doRead() |
| 243 | throw new NoSuchElementException(); |
| 244 | } |
| 245 | String rv = next; |
| 246 | next = null; |
| 247 | return rv; |
| 248 | } |
| 249 | |
| 250 | public void remove() { |
| 251 | throw new UnsupportedOperationException(); |
| 252 | } |
| 253 | |
| 254 | /** |
| 255 | * Intermediate method which parses whole file into a single String and closes the |
| 256 | * reader. Please see the restriction in the constructor on filesize. |
| 257 | * @return the text read (empty for an empty file), or null once the reader has been closed |
| 258 | * @throws RuntimeException if reading fails; the reader is closed in that case too |
| 259 | */ |
| 260 | private String doRead() { |
| 261 | if (reader == null) { |
| 262 | return null; |
| 263 | } |
| 264 | StringBuilder sb = new StringBuilder((int) fileSize); |
| 265 | try { |
| 266 | int rv; |
| 267 | while ((rv = reader.read(buf)) != -1) { |
| 268 | sb.append(buf, 0, rv); |
| 269 | } |
| 270 | } catch (Exception e) { |
| 271 | throw new RuntimeException("Error while parsing file", e); |
| 272 | } finally { |
| 273 | // Runs on failure as well, so a read error cannot leak the file handle. |
| 274 | closeReader(); |
| 275 | } |
| 276 | return sb.toString(); |
| 277 | } |
| 278 | |
| 279 | /** Closes the reader, ignoring any error, and drops the reference. */ |
| 280 | private void closeReader() { |
| 281 | if (reader != null) { |
| 282 | try { |
| 283 | reader.close(); |
| 284 | } catch (Exception e) { |
| 285 | // must ignore: a failed close should not mask the text or the read error |
| 286 | } finally { |
| 287 | reader = null; |
| 288 | } |
| 289 | } |
| 290 | } |
| 291 | } |
| 292 | } |
Note: See TracBrowser
for help on using the browser.
