Notes On Regex Applications

# Recognize Me, Regex’ize Me

Mi'kail Eli'yah
9 min read · Dec 10, 2022

We use Python for the illustrations here.

import re

def format_detected(target_strings, patterns_regex, patterns_regex_types):
    """Print every (string, pattern type) pair for which the pattern matches.

    Each string in ``target_strings`` is tested against each pattern in
    ``patterns_regex`` with ``match`` semantics (anchored at the start of the
    string). For every hit one line of the form
    ``"<string>: Search successful for <type>"`` is printed, where the type
    is the label from ``patterns_regex_types`` paired with that pattern.

    Args:
        target_strings: Strings to scan.
        patterns_regex: Regular-expression patterns, one per label.
        patterns_regex_types: Human-readable labels, parallel to
            ``patterns_regex``.

    Returns:
        None. Results are reported on standard output only.
    """
    # Compile each pattern once up front instead of once per (string, pattern)
    # pair inside the nested loop.
    labelled_patterns = [
        (re.compile(pattern), label)
        for pattern, label in zip(patterns_regex, patterns_regex_types)
    ]

    for target in target_strings:
        for compiled, label in labelled_patterns:
            # Scan the caller-supplied string, not a module-level list.
            if compiled.match(target) is not None:
                print(target + ": Search successful for " + label)

    return None

def displaymatch(match):
    """Describe a match object as text, or return None when nothing matched.

    The description shows the full matched text and the tuple of captured
    groups, both in ``repr`` form.
    """
    if match is not None:
        matched_text = match.group()
        captured = match.groups()
        return f"<Match: {matched_text!r}, groups={captured!r}>"
    return None  # no match

# ===== [main] =====
# Labels for what each pattern is meant to recognise. This list is parallel to
# `patterns` and `test_strings` below (same index = same case).
target_to_scan_for = [
    'Email address',
    'Private key',
    'Seed phrase',
    'Wallet address',
    'Test case: 0',
    'Phone number',
]

# The TLD class is [A-Za-z]: inside [...] a "|" is a literal pipe, not
# alternation, so [A-Z|a-z] would wrongly accept "|" in the TLD.
pattern_email = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"

patterns = [
    pattern_email,
    r"^[A-F0-9]{32}$",
    r"^(\w+\s){11}\w+$",
    r"^LB[a-fA-F0-9]{24}$",
    r"^[a2-9tjqk]{5}$",
    r"^(\+\d{1,2}\s?)?1?\-?\.?\s?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$",
]
test_strings = [
    "axx@gmail.com",
    "A6FDF18E86000542388064492B58CBF1",
    "this is a long string of text consisting of twelve random words",
    "LB32b787573F5186C696b8ed61",
    "akt5q",
    "+1-000-456-7890",
]

# Index of the single case exercised by the unit test below.
choice = 0

# Unit test
valid = re.compile(patterns[choice])
result = displaymatch(valid.match(test_strings[choice]))
print(result)


if result:
    print(target_to_scan_for[choice] + ": Search successful.")
else:
    print(target_to_scan_for[choice] + ": Search unsuccessful.")

# Full-run test: every test string against every pattern.
patterns_regex = patterns
patterns_regex_types = target_to_scan_for
format_detected(test_strings, patterns_regex, patterns_regex_types)


test_string = "The meeting is at 3pm today or 5 A.M. tomorrow or 7 this afternoon or 00:00 midnight --- axx@gmail.com ---. Let's meet at 11.30 p.M. We can also do 8:45 pm or 1200 hr or 00hr."

# Find every email-like substring, case-insensitively. re.finditer is called
# directly because the get_matches_* helpers are only defined further down
# (in the "Time (without dates)" section), so calling them here would raise
# NameError when the file is run top to bottom.
email_matches = list(re.finditer(pattern_email, test_string, re.I))

# Matched text of each hit.
matches_group_list = [m.group() for m in email_matches]
print(matches_group_list)

# (start, end) offsets of each hit within test_string.
matches_locations_list = [m.span() for m in email_matches]
print(matches_locations_list)

Sample result:

"""
axx@gmail.com: Search successful for Email address
A6FDF18E86000542388064492B58CBF1: Search successful for Private key
this is a long string of text consisting of twelve random words: Search successful for Seed phrase
LB32b787573F5186C696b8ed61: Search successful for Wallet address
akt5q: Search successful for Test case: 0
+1-000-456-7890: Search successful for Phone number
"""

You can also use a short test to observe a specific unit case:

pattern_X = r"..."
test_string = "...."


# Unit test
valid = re.compile(pattern_X)
result = displaymatch(valid.match(test_string))
print(result)

if result:
print(test_string + ": Search successful.")
else:
print(test_string + ": Search unsuccessful.")

Email address

# Local part, "@", domain labels, then a TLD of two or more letters.
# The TLD class is [A-Za-z]: inside [...] a "|" is a literal pipe, not
# alternation, so [A-Z|a-z] would wrongly accept "|" in the TLD.
pattern_email = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"

test_string = "axx@gmail.com"

Private key formatted

This example identifies a string of characters that comprises an example private key. The key consists of exactly 128 bits (32 hexadecimal characters) in an unspaced, capitalized, hexadecimal string located on one line.

# Exactly 32 upper-case hexadecimal digits on a single line. The digit range
# must be written with an ASCII hyphen ("0-9"); a typographic en dash would
# turn it into the three literal characters "0", "–" and "9".
pattern_private_key_formatted = r"^[A-F0-9]{32}$"

test_string = "A6FDF18E86000542388064492B58CBF1"

Seed phrase

This example identifies a string of characters that comprises a seed phrase consisting of 12 words, each separated by a single space, located on one line.

pattern_seed_phrase = r"^(\w+\s){11}\w+$"

test_string = "this is a long string of text consisting of twelve random words"

Wallet address

This example identifies a string of characters that comprises an example public wallet address. The address consists of the literal letters “LB” followed by exactly 24 characters in an unspaced, hexadecimal string.

# The literal prefix "LB" followed by exactly 24 hexadecimal digits (either
# case), on one line.
pattern_wallet_address = r"^LB[a-fA-F0-9]{24}$"

test_string = "LB32b787573F5186C696b8ed61"

# For comparison, a legacy Bitcoin-style address: a leading "1" or "3"
# followed by 25-34 Base58 characters (the class omits "0", "O", "I" and "l").
pattern_wallet_address_bitcoin = r"^[13][a-km-zA-HJ-NP-Z1-9]{25,34}$"

test_string_bitcoin = "1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa"

Phone number

pattern_phone_number = r"^(\+\d{1,2}\s?)?1?\-?\.?\s?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$"

test_string = "+1-000-456-7890"

IP address


# Dotted-quad IPv4: four octets, each in 0-255. The ranges use ASCII hyphens;
# an en dash inside [...] is a literal character, not a range operator.
pattern_ip_address = r"\b(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b"
# IPv4 only: ^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$
# IPv6 only: (([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))
""" e.g.
1200:0000:AB00:1234:0000:2552:7777:1313
1200::AB00:1234::2552:7777:1313
21DA:D3:0:2F3B:2AA:FF:FE28:9C5A
1200:0000:AB00:1234:O000:2552:7777:1313 // invalid characters present
FE80:0000:0000:0000:0202:B3FF:FE1E:8329
[2001:db8:0:1]:80 // valid, no support for port numbers
http://[2001:db8:0:1]:80 // valid, no support for IP address in a URL
"""
# For both IPv4 and IPv6: ((^\s*((([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))\s*$)|(^\s*((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)?\s*$))
""" e.g.
1200:0000:AB00:1234:0000:2552:7777:1313
21DA:D3:0:2F3B:2AA:FF:FE28:9C5A
1200:0000:AB00:1234:O000:2552:7777:1313 // invalid characters present
FE80:0000:0000:0000:0202:B3FF:FE1E:8329
[2001:db8:0:1]:80 // valid, no support for port numbers
http://[2001:db8:0:1]:80 // valid, no support for IP address in a URL
0.0.0.0
9.255.255.255
"""

test_string = "255.255.11.135" # -ve case: "999.255.11.135"

Directory path

# Captures the leading run of characters that contains no "/" or "\".
pattern_directory_path = r"([^/\\]*)"
# The notes and the sample path below are raw strings: their backslashes are
# regex / path text, not Python string escapes.
r"""
File Path with Filename and extension:
((\/|\\|\/\/|https?:\\\\|https?:\/\/)[a-z0-9 _@\-^!#$%&+={}.\/\\\[\]]+)+\.[a-z]+$

File Path with optional Filename, extension:
^(.+)/([^/]+)$

File Name with extension having 3 chars:
^[\w,\s-]+\.[A-Za-z]{3}$
"""

test_string = r"this file is from \My Documents\sensitive\passcodes" # or "this file is from /My Documents/sensitive/passcodes"

URL

# Protocol required: http(s). Digit ranges use ASCII hyphens ("0-9"); an en
# dash inside [...] would be a literal character rather than a range.
pattern_url_with_protocol = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#()?&//=]*)"
# Protocol optional
pattern_url_protocol_optional = r"(https?:\/\/)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
# pattern_X is the name the generic unit-test snippet reads; it holds the
# protocol-optional form.
pattern_X = pattern_url_protocol_optional

# general: url_pattern = "(https?)://(www)?.?(\\w+).(\\w+)/?(\\w+)?"

test_string = "http://foo.com/blah_(wikipedia)#cite-1"

Test passcode strength requirements

Test for specific passcode strength requirements, e.g. the passcode should have at least 1 lowercase letter, 1 uppercase letter, 1 number and 1 special character, and be at least 8 characters long.

pattern_X = r"(?=(.*[0-9]))(?=.*[\!@#$%^&*()\\[\]{}\-_+=~`|:;\"'<>,./?])(?=.*[a-z])(?=(.*[A-Z]))(?=(.*)).{8,}"

test_string = "sEcReT_*(8???)"

HTML tags

# Matches a simple opening or closing tag, or any "<...>" whose last character
# before ">" is a non-word character.
pattern_X = r"<\/?[\w\s]*>|<.+[\W]>"

test_string = "<p>\"Hello ... hello.\"</p>"

Javascript handlers

pattern_X = r"\bon\w+=\S+(?=.*>)"

test_string = """<img src="foo.jpg" onload=function_xyz />
<img onmessage="javascript:execute()">
<a notonmessage="nomatch-here" onfocus="alert('hey')" onclick=foo() disabled>
"""

Search duplicates in text

pattern_X = r"(\b\w+\b)(?=.*\b\1\b)"

# needs to be converted from a multi-line """ ... """ string to a single-line "..." string before use ("." does not match newlines unless re.DOTALL is set)
test_string = """
world .... world .... world world ...
"""

Address (Geolocation)

pattern_geolocation_address = "(\\d*)\\s?(.+),\\s(.+)\\s([A-Z]{2,3})\\s(\\d{4})"

Notes:
"""
(\\d*) := 0 or more digit characters because some addresses do not have house numbers (1st capture group i.e. house number)
\\s? := 0 or 1 whitespace character
(.+) := 1 or more characters (2nd capture group i.e. street name)
, := comma
\\s := a single whitespace character
(.+) := 1 or more characters (3rd capture group i.e. suburb)
\\s := a single whitespace character
([A-Z]{2,3}) := 2 or 3 uppercase letters (4th capture group i.e. state)
\\s := a single whitespace character
(\\d{4}) := 4 digit characters (5th capture group i.e. postcode)
"""

# Raw string so that \d and \s reach the regex engine untouched instead of
# being parsed as (invalid) Python string escapes.
pattern_geolocation_coordinates = r"(\d{1,4})([.])(\d{4})[°](\s?)([NS]?)[NSEW](\s?)((\s+?)?)[,](\s?)((\s+?)?)(\d{1,4})([.])(\d{4})[°](\s?)([NS]?)[NSEW]"
test_string = "38.8719° N , 77.0563° SW" # 90.0000° S, 45.0000° E

Hexadecimal strings

pattern_X = r"^(0x|0X)?[a-fA-F0-9]+$"
test_string = "0xaa005a" # aa005a

Linux permissions

pattern_X = r"[0-7]{1,4}" # linux-file-permissions-numeric-notation

test_string = "666" # "0655"

"""
pattern_X = r"[d|-|b|c|l|p|s|D][r|w|x|-]{9}"
test_string = "drwxr-xr-x"
"""

File types

pattern_X = r"(?i)^.*\.(jpg|jpeg|gif|doc|pdf|txt)$" # (?i): case insensitive
test_string = "hello.JpEg" #

Timestamp

pattern_date = (
r"(\d{2}|\d{4})[- \/.]((0|1|2)\d{1})[- \/.](\d{2,4})"
)

test_strings = ["22/02/2222_0000hr", "22.02.2222", "22-02.2222", "2222/02/22", "2222-02/22", "2222-02.22"];


pattern_date = (
r"(\d{1,2}|\d{4})[- /.]((J|j)an(uary)?|(F|f)eb(ruary)?|(M|m)ar(ch)?|(A|a)pr(il)?|(M|m)ay|(J|j)un(e)?|(J|j)ul(y)?|(A|a)ug(ust)?|(S|s)ep(t(ember)?)?|(O|o)ct(ober)?|(N|n)ov(ember)?|(D|d)ec(ember)?)[- /.](\d{1,2}|\d{4})"
)

test_strings = ["02-May-2011", "02 May 2011", "11.may 2011", "11.june 2011", "11.Jul 2011"];

Ref: https://www.regextester.com

Time (without dates)

import re

def get_matches_group_list(pattern, string_target):
    """Return the matched text of every case-insensitive hit of *pattern*.

    Hits are the non-overlapping matches yielded by ``re.finditer`` with the
    ``re.I`` flag, in left-to-right order.
    """
    found_texts = []
    for hit in re.finditer(pattern, string_target, re.I):
        found_texts.append(hit.group())
    return found_texts

def get_matches_locations(pattern, string_target):
    """Return the ``(start, end)`` offsets of every case-insensitive hit.

    Offsets index into *string_target*; ``end`` is exclusive. Hits are the
    non-overlapping matches yielded by ``re.finditer`` with ``re.I``.
    """
    spans = []
    for hit in re.finditer(pattern, string_target, re.I):
        spans.append(hit.span())
    return spans

pattern_time = (
r"\b\d{1,2}([: .]?)(\d{2})?(\s?)((P|p)|(A|a)?)(.?)((M|m)?)(.?)((next|this)?)(\s?)((tomorrow|today|day|evening|morning|(mid?)night|((after|before)?)(noon))?)\b"
)

test_string = "The meeting is at 3pm today or 5 A.M. tomorrow or 7 this afternoon or 00:00 midnight. Let's meet at 11.30 p.M. We can also do 8:45 pm or 1200 hr or 00hr."

matches_group_list = get_matches_group_list(pattern_time, test_string)
print(matches_group_list)

matches_locations_list = get_matches_locations(pattern_time, test_string)
print(matches_locations_list)

Specific format

# Example 1:
pattern_X = r"^[#%][a-zA-Z0-9]{22,30}$"

test_string = "#6FDF18E86000542388064492B58CB" # 30 characters
test_string = "%6FDF18E86000542388064492B58CB" # 30 characters

# Example 2:
length_of_1st_segment = '{5,10}'
pattern_X = "^[#%][a-zA-Z0-9+]"
pattern_X = pattern_X + length_of_1st_segment + "$"

test_string = "#6FDF1+++" # 9 characters

# Example 3:
length_of_1st_segment = '{5,10}'
pattern_X = "^[#%][a-zA-Z0-9]"
pattern_X = pattern_X + length_of_1st_segment
length_of_2nd_segment = '{1,3}'
pattern_X = pattern_X + "[+@]" + length_of_2nd_segment + "$"

test_string = "#6FDF1@@@" # 9 characters

# Example 4:
length_of_1st_segment = '{5,10}'
pattern_X = "^[#%][a-zA-Z0-9]"
pattern_X = pattern_X + length_of_1st_segment
length_of_2nd_segment = '{1,3}'
pattern_X = pattern_X + "[+@]" + length_of_2nd_segment
length_of_3rd_segment = '{2,3}'
pattern_X = pattern_X + "[0-9]" + length_of_3rd_segment + "$"

test_string = "#6FDF1@@@01" # 11 characters

N-gram

# Example 1:
pattern_3_gram_with_numbers = r"\b\w{3}\b"
pattern_3_gram_without_numbers = r"\b[a-zA-Z]{3}\b"
pattern_4_gram = r"\b\w{4}\b"
pattern_4_gram_without_numbers = r"\b[a-zA-Z]{4}\b"

pattern_3_gram_with_numbers_only =r"\b[0-9]{3}\b" # numbers only with length = 3

pattern_3_gram_non_alphanumeric_non_space =r"[^a-zA-Z0-9 ]{3}"

pattern_arbitrary_gram_with_numbers_only_non_space =r"([^a-zA-Z\"=_<>?./\\'{}|+_\-&^%$#@!~`:;, ][0-9]*)"

test_string = "top regular 111 expressions 123523423 halo <<< y 2g2 k0l 111111 "

# Example 2: List the locations as well
import re

pattern_arbitrary_gram_with_numbers_only_non_space =r"([^a-zA-Z\"=_<>?./\\'{}|+_\-&^%$#@!~`:;, ][0-9]*)"

test_string = "top regular 111 expressions 123523423 halo <<< y 2g2 k0l 111111 "

matches = [m.group() for m in re.finditer(pattern_arbitrary_gram_with_numbers_only_non_space, test_string, re.I)]
print(matches)

match_locations = [(m.start(), m.end()) for m in re.finditer(pattern_arbitrary_gram_with_numbers_only_non_space, test_string, re.I)]
print(match_locations)

Template Detection

Email headers

# example of email header template detection
"""
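According to the Internet Message Format (RFC 5322), only Date and From are strictly mandatory; To and Message-ID are optional but almost always present. The fields checked here are: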
Date: Specifies the date and time the message was sent.
From: Specifies the email address of the sender.
To: Specifies the email address(es) of the intended recipient(s).
Message-ID: Provides a unique identifier for the message.
"""
import re

# Example email message
email_message = """
Delivered-To: john.doe@gmail.com
Received: by 10.107.28.21 with SMTP id p21csp515449iof;
Wed, 21 Apr 2023 09:12:14 -0700 (PDT)
X-Received: by 2002:a1c:4c12:: with SMTP id z18mr15577774wmc.160.1618971834241;
Wed, 21 Apr 2023 09:12:14 -0700 (PDT)
Received: from mail.example.com (mail.example.com. [203.0.113.1])
by mx.google.com with ESMTPS id v13si4868822wru.14.2023.04.21.09.12.14
for <john.doe@gmail.com>
(version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256);
Wed, 21 Apr 2023 09:12:14 -0700 (PDT)
From: Jane Doe <jane.doe@example.com>
To: John Doe <john.doe@gmail.com>
Subject: Example email
Message-ID: <CAJG9ZdPY7f4Kv4w_Eh1xjR-e8zm5Wzr5r3qvy3ZiajP8PaQW-g@mail.example.com>
Date: Wed, 21 Apr 2023 16:12:13 +0000
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:88.0) Gecko/20100101 Thunderbird/88.0
MIME-Version: 1.0
Content-Type: multipart/alternative;
boundary="------------030204060:

This is an example email message.
"""

# Regular expression patterns for email headers
pattern_to = r'^To: (.+)$'
pattern_from = r'^From: (.+)$'
pattern_subject = r'^Subject: (.+)$'
pattern_message_id = r'^Message-ID: (.+)$'
pattern_date = r'^Date: (.+)$'

# Extract the headers using regular expressions
match_to = re.search(pattern_to, email_message, flags=re.MULTILINE)
match_from = re.search(pattern_from, email_message, flags=re.MULTILINE)
match_subject = re.search(pattern_subject, email_message, flags=re.MULTILINE)
match_message_id = re.search(pattern_message_id, email_message, flags=re.MULTILINE)
match_date = re.search(pattern_date, email_message, flags=re.MULTILINE)

# Print the extracted header values
if match_to:
print("To:", match_to.group(1))
if match_from:
print("From:", match_from.group(1))
if match_subject:
print("Subject:", match_subject.group(1))
if match_message_id:
print("Message-ID:", match_message_id.group(1))
if match_date:
print("Date:", match_date.group(1))

if (match_to) and (match_from) and (match_subject) and (match_message_id) and (match_date):
print("[email header template detected]")

In Between Lines

import re

test_strings = """
....
....
[start]
you should
see me
[end]
.....
....
"""
# Define the start and end tags as regular expressions
start_tag = r'\[start\]'
end_tag = r'\[end\]'

# Construct a regex pattern to match everything between the start and end tags
pattern = f'{start_tag}(.*?){end_tag}'
# Use re.findall to extract the text between the tags
matches = re.findall(pattern, test_strings, re.DOTALL)

# Print the extracted text
for match in matches:
print(match.strip()) # Use strip() to remove leading/trailing whitespace

"""
you should
see me
"""
_
import re

test_strings = """
....
....
<input required type="hidden" name="

id_tracking_no value = "
random ...
random ....
"

">
.....
....
"""

# Construct a regex pattern that captures the quoted value following `id_tracking_no value = `
pattern = f'id_tracking_no value = "([^"]+)"'

# Use re.findall to extract the captured value(s); re.DOTALL has no effect on this pattern (it contains no "."), and [^"]+ already spans newlines
matches = re.findall(pattern, test_strings, re.DOTALL)

# Print the extracted text
for match in matches:
print(match.strip()) # Use strip() to remove leading/trailing whitespace

""" output:
random ...
random ....
"""
https://regex101.com/
_

For more on detection using extraction and hash comparison, see: Template Matching.

Just in case … here is a good library utility:

https://ihateregex.io/

Caveats

Some life-wasting bugs are almost beyond mortal eyes:

"""
line1 = "^[a-zA-Z0–9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"
line2 = "^[a-zA-Z0–9_.+-]+@[a-zA-Z0–9-]+\.[a-zA-Z0–9-.]+$"

for i in range(0, len(line1)):
if (line1[i] != line2[i]):
print(line1[i] + " is different from " + line2[i] + " at position " + str(i))
"""

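Result (line2 contains en dashes "–" where line1 has ASCII hyphens "-"; inside a character class an en dash is a literal character, so "0–9" no longer means the digit range 0-9):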
- is different from – at position 26
- is different from – at position 41

--

--

Mi'kail Eli'yah
Mi'kail Eli'yah

No responses yet