001package com.randomnoun.common.db;
002
003import java.io.IOException;
004import java.io.InputStream;
005import java.io.InputStreamReader;
006import java.io.Reader;
007import java.text.ParseException;
008import java.util.ArrayList;
009import java.util.List;
010import java.util.regex.Matcher;
011import java.util.regex.Pattern;
012
013/** A class to read an InputStream containing SQL statements (e.g. a MySQL input file) and split it into 
014 * individual SQL statements.
015 * 
016 * <p>Some MySQL command-line directives (e.g. 'DELIMITER') are processed whilst parsing. 
017 */
018public class SqlParser {
019        
020        private static Pattern MYSQL_DELIMITER_COMMAND = Pattern.compile("delimiter\\s+(.*)$", Pattern.CASE_INSENSITIVE);
021        
022        // could have a ParseState that contains things like the current delimiter, or current database
023        // (for 'use' commands), which can be modified by the InputStream, or the database vendor
024        // if we wanted to support escape sequences, which appear to be database-specific.
025        
026        /** Convert an InputStream of SQL statements into a List of individual
027         * statements. Statements are delimited by ";" strings that occur outside of strings or comments.
028         * The delimiter string is not included in the returned list of statements.
029         * 
030         * <p>The delimiter string may be changed using the 'delimiter str' command. 
031         * This command may end with a newline instead of a delimiter.
032         * 
033         * <p>Comments may also be returned in the result.
034         * <p>Comments are defined by '-- to-end-of-line' or '/* within slash-star star-slash *&#42;/' syntax.
035         * Comments that are created with '--' and that occur within a statement are returned before that statement
036         * has finished parsing.
037         * 
038         * <p>NB: Does not handle escape sequences found within double or single quotes
039         * (i.e. escape sequences are handled by the underlying database, not by this Parser)
040         * 
041         * @param is inputStream to parse
042         * @param includeComments include comment strings in result
043         * 
044         * @throws IOException 
045         * @throws ParseException unclosed /*-style comment or single/double-quoted string
046         */
047        public List<String> parseStatements(InputStream is, boolean includeComments) throws IOException, ParseException {
048                return parseStatements(new InputStreamReader(is), includeComments);
049        }
050        
051        /** Convert an InputStream of SQL statements into a List of individual
052         * statements. Statements are delimited by ";" strings that occur outside of strings or comments.
053         * The delimiter string is not included in the returned list of statements.
054         * 
055         * <p>The delimiter string may be changed using the 'delimiter str' command. 
056         * This command may end with a newline instead of a delimiter.
057         * 
058         * <p>Comments may also be returned in the result.
059         * <p>Comments are defined by '-- to-end-of-line' or '/* within slash-star star-slash *&#42;/' syntax.
060         * Comments that are created with '--' and that occur within a statement are returned before that statement
061         * has finished parsing.
062         * 
063         * <p>NB: Does not handle escape sequences found within double or single quotes
064         * (i.e. escape sequences are handled by the underlying database, not by this Parser)
065         * 
066         * @param reader Reader to parse
067         * @param includeComments include comment strings in result
068         * 
069         * @throws IOException 
070         * @throws ParseException unclosed /*-style comment or single/double-quoted string
071         */
072        public List<String> parseStatements(Reader r, boolean includeComments) 
073                throws IOException, ParseException 
074        {
075                final List<String> allSql = new ArrayList<String>();
076                consumeStatements(r, includeComments, new Consumer<String>() {
077                        @Override
078                        public void consume(String s) {
079                                allSql.add(s);
080                        }
081                });
082                return allSql;
083        }
084
085        public static interface Consumer<T> {
086                public void consume(T s);
087        }
088        
089        /** convert a text file of SQL statements into a List of individual
090         * statements. Comments may also be returned in the result.
091         * 
092         * <p>NB: Does not handle escape sequences found within double or single quotes
093         * 
094         * @param includeComments include comment strings in result
095         * 
096         * @throws IOException 
097         * @throws ParseException unclosed /*-style comment or single/double-quoted string
098         */
099        public void consumeStatements(InputStream is, boolean includeComments, Consumer<String> consumer) 
100                        throws IOException, ParseException {
101                consumeStatements(new InputStreamReader(is), includeComments, consumer);
102        }
103        
104        public void consumeStatements(Reader is, boolean includeComments, Consumer<String> consumer) 
105                throws IOException, ParseException 
106        {
107                // @TODO some error handling
108                // List allSql = new ArrayList<String>();
109                
110                int state = 0; // command parsing [to states 0, 1, 2, 4, 7, 8]
111                // 1 // parsed double quote, in double quotes [to states 0, 1, 9]
112                // 2 // parsed - [to states 0, 3]
113                // 3 // parsed -- [ to states 0, 3]
114                // 4 // parsed /  [ to states 0, 5]
115                // 5 // parsed /* [ to states 6, 5]
116                // 6 // parsed * from state 5  [ to states 0, 5]
117                // 7 // parse single quote, in single quotes [to states 0, 7, 10]
118                // 8 // parsed backslash in SQL, next character will be emitted without changing state [then to state 0]
119                // 9 // parsed backslash in double quote, next char emitted without changing state [then to state 1]
120                // 10 // parsed backslash in single quote, next char emitted without changing state [then to state 7]
121
122                String s = "";   // current statement
123                String snc = ""; // current statment without comments
124                String c = "";   // current comment
125                String delimiter = ";"; // default delimiter
126
127                int intch = is.read();
128                int delimIdx = 0; // number of delimiter characters read
129                while (intch!=-1) {
130                        char ch = (char) intch;
131                        
132                        if (state==0) {
133                                if (ch == delimiter.charAt(delimIdx)) {
134                                        delimIdx++;
135                                        if (delimiter.length()==delimIdx) {
136                                                s = s.trim(); delimIdx = 0; // was s.trim() + ";"
137                                                // could check for all commands at http://dev.mysql.com/doc/refman/5.7/en/mysql-commands.html
138                                                // but let's just implement 'DELIMITER' for now
139                                                Matcher m = MYSQL_DELIMITER_COMMAND.matcher(s);
140                                                if (m.matches()) {
141                                                        delimiter = m.group(1); // set the new delimiter
142                                                } else {
143                                                        consumer.consume(s); 
144                                                }
145                                                s="";
146                                        } else {
147                                                state = 0; 
148                                        }
149                                } else {
150                                        if (delimIdx > 0) {
151                                                // could push these back onto the inputStream in case the delimiter startsWith a " or -, but that seems a bit fiddly
152                                                s = s + delimiter.substring(0, delimIdx); delimIdx = 0; 
153                                        }
154                                        switch(ch) {
155                                                case '"' : state = 1; s = s + ch; break;
156                                                case '-' : state = 2; break;
157                                                case '/' : state = 4; break;
158                                                case '\'' : state = 7; s = s + ch; break;
159                                                case '\r' :
160                                                case '\n' :
161                                                        // if this is a delimiter command, process it
162                                                        Matcher m = MYSQL_DELIMITER_COMMAND.matcher(s.trim());
163                                                        if (m.matches()) {
164                                                                delimiter = m.group(1); // set the new delimiter
165                                                                s = ""; delimIdx = 0;
166                                                        } else {
167                                                                s = s + ch;
168                                                        }
169                                                        break;
170                                                case '\\': state = 8; s = s + ch; break;                                                        
171                                                default: s = s + ch;
172                                        }
173                                }
174                                
175                        } else if (state==1) {
176                                switch(ch) {
177                                        case '"' : state = 0; s = s + ch; break;
178                                        case '\\': state = 9; s = s + ch; break;
179                                        default: s = s + ch;
180                                }
181                        } else if (state==2) {
182                                switch(ch) {
183                                        case '-' : state = 3; break;
184                                        default: state = 0; s = s + "-" + ch;
185                                }
186                        } else if (state==3) {
187                                switch(ch) {
188                                        case '\r' : 
189                                        case '\n' : 
190                                                state = 0; 
191                                                if (includeComments) {
192                                                        // include comments in the statements the comments appear in, unless this is a comment outside of a statement
193                                                        s += "-- " + c.trim();
194                                                        if (s.trim().startsWith("--")) {
195                                                                consumer.consume(s.trim());
196                                                                s = "";
197                                                        } else {
198                                                                s += "\n";
199                                                        }
200                                                }
201                                                c = ""; break;
202                                        default :
203                                                c = c + ch;
204                                }
205                        } else if (state==4) {
206                                switch(ch) {
207                                        case '*' : state = 5; break;
208                                        default: state = 0; s = s + "/" + ch;
209                                }
210                        } else if (state==5) {
211                                switch(ch) {
212                                        case '*' : state = 6; break;
213                                        default: c = c + ch;
214                                }
215                        } else if (state==6) {
216                                switch(ch) {
217                                        case '/' : 
218                                                state = 0; 
219                                                if (includeComments) {
220                                                        // include comments in the statements the comments appear in, unless this is a comment outside of a statement
221                                                        s += ("/* " + c.trim() + " */");
222                                                        if (s.startsWith("/*")) {
223                                                                consumer.consume(s);
224                                                                s = "";
225                                                        }
226                                                } 
227                                                c=""; break;
228                                        default: c = c + "*" + ch;
229                                }
230                        } else if (state==7) {
231                                switch(ch) {
232                                        case '\'' : state = 0; s = s + ch; break;
233                                        case '\\' : state = 10; s = s + ch; break;
234                                        default: s = s + ch;
235                                }
236                        } else if (state==8) {
237                                // could check ch is printable here, but probably ok
238                                s = s + ch;
239                                state = 0;
240                        } else if (state==9) {
241                                s = s + ch;
242                                state = 1;
243                        } else if (state==10) {
244                                s = s + ch;
245                                state = 7;
246                        }
247                        
248                        intch = is.read();
249                }
250                
251                if (state==5) {
252                        // unclosed /*-style comment
253                        // ignore for the time being
254                        throw new ParseException("Unclosed /*-style comment before EOF", -1);
255                } else if (state==1) {
256                        // unclosed quoted string
257                        // ignore for the time being
258                        throw new ParseException("Unclosed double quoted string before EOF", -1);
259                } else if (state==7) {
260                        // unclosed quoted string
261                        // ignore for the time being
262                        throw new ParseException("Unclosed single quoted string before EOF", -1);
263                } else if (state==0) {
264                        if (!s.trim().equals("")) {
265                                // unterminated statement at end of InputStream; add to list
266                                consumer.consume(s.trim());
267                        }
268                }
269                
270                // return allSql;
271        }
272
273
274
275}