Monday, February 27, 2012

Regex look-ahead/behind syntax

// Reference: http://groovy.codehaus.org/Regular+Expressions
// Find every run of one or more 'e' characters in a sentence.
def r = /e+/
def str = 'me cheese please'
def m = (str =~ r)

// Five runs are found: one in "me", two in "cheese" and two in "please".
assert 5 == m.count
println m.collect { it.toString() } //[e, ee, e, e, e]

/////////////////////////////

// Capture-group demo: match "This boy is" and keep only the "This boy" part.
// r, str and m were already declared by the first example, so they are
// reassigned here; declaring them with `def` a second time in the same script
// scope is a compile error.
r = /(This boy) is/
str = 'This boy is 10. This boy wants chocolate. This boy is tall.'
m = str =~ r

// Only the first and last sentences contain "This boy is".
assert m.size() == 2
// Each match of a pattern with groups is a list: [whole match, group 1, ...].
assert m.collect { it[1] } == ['This boy','This boy']

/////////////////////////////

// Back references: $1 and $3 in the replacement refer to capture groups 1 and 3,
// so the text matched by the middle group ("b") is dropped from the result.
def withoutMiddle = "abc".replaceAll(/(a)(b)(c)/, "\$1\$3")
assert 'ac' == withoutMiddle //back references

/////////////////////////////

// Reference: http://www.regular-expressions.info/captureall.html
// The quantifier sits inside the capture group, so group 1 holds the whole
// repeated run of "abc"/"123" tokens rather than only the last repetition.
r = /((?:abc|123)+).*?/
str = '123abc 123abc123'
m = (str =~ r)

// One run on each side of the space.
assert 2 == m.count
println m.collect { it.toString() }
// Skip element 0 (the whole match) and show only the captured groups.
print m[0].tail()
println m[1].tail()

/////////////////////////////

// Password rule: at least 8 characters long, at least two of them non-letters.
// The trailing look-behind (?<=.{8}) only succeeds when at least 8 characters
// precede the end of the match, which enforces the minimum length.
def r1 = /.*[^a-zA-Z].*[^a-zA-Z].*(?<=.{8})/  //look-behind ?<=

// Groovy has no `!=~` operator: `a !=~ b` parses as `a != (~b)`, i.e. a String
// compared with a Pattern, which is always true. Negate `==~` explicitly.
assert !('abc' ==~ r1)
assert !('abcde12' ==~ r1)       // two non-letters but only 7 characters
assert 'abcdef12' ==~ r1
assert 'abc1defgggg2' ==~ r1
assert !('abc1defgggg' ==~ r1)   // long enough but only one non-letter

// The word must be "foo" immediately followed by $wrd and then any further
// word characters; everything after "foo" (including $wrd) is captured.
def wrd = 'bar'
def r2 = /foo((?=$wrd)[\w]+)/  //look-ahead ?=

def foo = "foo${wrd}hellow"
assert foo ==~ r2
// m was already declared by the first example, so assign instead of redeclaring
// (a second `def m` in the same script scope is a compile error).
m = foo =~ r2
assert m[0][1] == "${wrd}hellow" // note: $wrd not consumed by check, is stored in result

foo = 'foohellow' // no $wrd
// `!=~` is not an operator (it parses as `foo != ~r2`, which is always true),
// so negate the `==~` match explicitly.
assert !(foo ==~ r2)

//ADVANCED/////////////////////////////
/**
 * Abbreviates compass-direction words in a piece of text.
 *
 * Each rule is applied in order to the output of the previous one, replacing
 * every match of the rule's pattern with its abbreviation. The text after each
 * rule is printed as a trace.
 *
 * The patterns rely on two regex features:
 *  - atomic groups (?>...) so that an optional suffix such as "ern", once
 *    matched, cannot be given back to satisfy the following look-ahead;
 *  - negative look-ahead (?!...) / look-behind to skip phrases that should be
 *    left alone, e.g. "West Virginia" or "George West".
 *
 * @param text the text to rewrite
 * @return the text after all rules have been applied
 */
def churnText(String text) {
    // k: pattern to search for, v: replacement abbreviation. Order matters:
    // compound directions (NE, NW, SE, SW) are handled before the single ones.
    def points = [[k: ~/(N|n)orth(E|e)ast(ern)?/                                                                , v:'NE'],
              [k: ~/(?>(N|n)orth(W|w)est(ern|:)?)(?! Territories)/                                              , v:'NW'],
              [k: ~/(S|s)outheast(ern)?/                                                                        , v:'SE'],
              [k: ~/(?>(S|s)outh(\s)?(W|w)est(ern)?)(?! Hill| Bend)/                                            , v:'SW'],
              [k: ~/(?>(N|n)orth(ern)?|Upstate)(?! Carolina| Dakota| Platte| Neck| Mariana Islands| Bay|ridge)/ , v:'N' ],
              [k: ~/(E|e)ast(ern)?/                                                                             , v:'E' ],
              [k: ~/(?>(S|s)outh(ern|side)?)(?! Carolina| Dakota)/                                              , v:'S' ],
              [k: ~/(?!(?<=George ))(?>(W|w)est(ern| of the)?)(?! Virginia| Palm Beach)/                        , v:'W' ],
              [k: ~/(?>(C|c)entral|Center|Middle|the middle section of the)(?!town| Peninsula| Tennessee|ia)/   , v:'C' ]]

    points.each { p ->
      text = (text =~ p.k).replaceAll(p.v)
      println "p.v: ${p.v} text: $text"
    }

    // Hand the rewritten text back to the caller; previously the result of
    // `each` (the rule list) was returned and the computed text was lost.
    text
}

// Run churnText over a few sample phrases, printing a 40-character rule
// between consecutive samples.
def samples = ['Northwest Virginia', 'NorthWestern Territories', 'East George West']
samples.eachWithIndex { sample, idx ->
    if (idx > 0) {
        println '='*40
    }
    churnText(sample)
}
return

No comments:

Post a Comment