-
Notifications
You must be signed in to change notification settings - Fork 0
/
class.Find_Dates_in_String.rb
901 lines (812 loc) · 50.6 KB
/
class.Find_Dates_in_String.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
require 'Date'
require 'class.Date.extend.rb'
require 'class.Array.extend.rb'
require 'class.Symbol.extend.rb'
require 'module.SE.rb'
require 'module.ArchivesSpace.Konstants.rb'
class Find_Dates_in_String
# Validate the caller's options, fill in defaults, and build the date regexp fragments and
# the Struct classes used by the rest of the class.
#
# option_H keys (all optional; any other key raises):
#   :debug_options            Symbol, Array of Symbols, or Hash keyed by Symbol (spelling is not checked).
#   :morality_replace_option  Hash with :good and/or :bad => :keep | :remove  (defaults: good => :remove, bad => :keep).
#   :thru_date_separators     String, or Array of Strings (each is stripped and regexp-escaped; default '-| through ').
#   :date_clump_separators    :none, or a String of the form '[xyz]' (default '[|/]| and ').
#   :pattern_name_RES         String regexp selecting which fmtNNN__ patterns are used (default '.').
#   :default_century          'NN' or 'NN00' (default "19"); stored as the two leading digits.
#   :yyyy_min_value           'NNNN' (default '1800').
#   :yyyy_max_value           'NNNN' (default '2100').
#   :date_string_composition  :only_dates | :dates_in_text (default :dates_in_text).
#   :nn_mmm_nn_day_year_order :dd_mm_yy | :yy_mm_dd (default :dd_mm_yy).
#   :nn_nn_nn_date_order      :mm_dd_yy | :dd_mm_yy | :yy_mm_dd (default :mm_dd_yy).
#   :sort                     true | false (default true).
#
# Raises (bare RuntimeError) after printing a diagnostic through SE when an option is invalid.
#
# NOTE(review): SE.q is handed the *name* of a variable as a string; it presumably evaluates that
# name in the block's binding, so the locals named in those blocks must not be renamed -- confirm.
def initialize( option_H = {})
    # NOTE(review): this drops into a pry session on every instantiation whenever this object
    # responds to :pry. It looks like a debugging leftover -- confirm before removing.
    binding.pry if ( respond_to? :pry )
    if ( not option_H.is_a?( Hash ) ) then
        SE.puts "#{SE.lineno}: Expected param to be a type HASH."
        SE.q { 'option_H' }
        raise
    end
    @uid_string = "DATE_CLUMP_#:"
    @option_H = option_H.merge( {} )        # Shallow copy, the caller's hash itself is not modified.
    @option_H.each_key do | option_H_key |
        case option_H_key
        when :debug_options
            # Whatever form was passed in, :debug_options ends up as a Hash keyed by Symbol.
            case true
            when @option_H[ option_H_key ].is_a?( Hash )
                @option_H[ option_H_key ].each_pair do | key, value |
                    if ( not key.is_a?( Symbol ) ) then
                        SE.puts "#{SE.lineno}: Expected '#{key}' to be of type 'Symbol', not '#{key.class}'"
                        SE.q { 'key' }
                        SE.q { 'option_H' }
                        raise
                    end
                    SE.puts "#{SE.lineno}: Debug option: ':#{key}' = '#{value}' set.  Note: Option spelling is NOT checked!!!"
                end
            when @option_H[ option_H_key ].is_a?( Symbol )
                h = { @option_H[ option_H_key ] => nil }
                SE.puts "#{SE.lineno}: Debug option: ':#{@option_H[ option_H_key]}' = 'nil' set.  Note: Option spelling is NOT checked!!!"
                @option_H[ option_H_key ] = h
            when @option_H[ option_H_key ].is_a?( Array )
                h = {}
                @option_H[ option_H_key ].each do | element |
                    if ( not element.is_a?( Symbol ) ) then
                        SE.puts "#{SE.lineno}: Expected '#{element}' to be of type 'Symbol', not '#{element.class}'"
                        SE.q { 'element' }
                        SE.q { 'option_H' }
                        raise
                    end
                    SE.puts "#{SE.lineno}: Debug option: ':#{element}' = 'nil' set.  Note: Option spelling is NOT checked!!!"
                    h[ element ] = nil
                end
                @option_H[ option_H_key ] = h
            else
                SE.puts "#{SE.lineno}: Expected '#{option_H_key}' to be of type Symbol, Hash, or Array, not '#{@option_H[ option_H_key ].class}'"
                SE.q { 'option_H' }
                raise
            end
        when :morality_replace_option
            if ( not @option_H[ option_H_key ].is_a?( Hash ) ) then
                SE.puts "#{SE.lineno}: Expected '#{option_H_key}' to be a type Hash, not '#{@option_H[ option_H_key ].class}'"
                SE.q { 'option_H' }
                raise
            end
            @option_H[ option_H_key ].each_pair do | key, value |
                case key
                when :good
                    if ( not ( value.is_a?( Symbol ) and value.in?( [ :keep, :remove ] ) ) ) then
                        SE.puts "#{SE.lineno}: option_H[ :morality_replace_option ][ #{key} ] should be [ :keep, :remove ]"
                        SE.q { 'option_H' }
                        raise
                    end
                when :bad
                    if ( not ( value.is_a?( Symbol ) and value.in?( [ :keep, :remove ] ) ) ) then
                        SE.puts "#{SE.lineno}: option_H[ :morality_replace_option ][ #{key} ] should be [ :keep, :remove ]"
                        SE.q { 'option_H' }
                        raise
                    end
                else
                    SE.puts "#{SE.lineno}: unknown :morality_replace_option '#{key}', it should either be :good or :bad (obviously)"
                    SE.q { 'option_H' }
                    raise
                end
            end
        when :thru_date_separators
            case true
            when @option_H[ option_H_key ].is_a?( Array ) then
                ary = []
                @option_H[ option_H_key ].each do | separator |
                    # Non-destructive strip: the elements belong to the caller (the option hash copy
                    # is shallow) and may be frozen, so they must not be modified in place.
                    ary << Regexp::escape( separator.strip )
                end
                @option_H[ option_H_key ] = ary.join("|")
            when @option_H[ option_H_key ].is_a?( String )
                @option_H[ option_H_key ] = Regexp::escape( @option_H[ option_H_key ] )
            else
                SE.puts "#{SE.lineno}: Expected '#{option_H_key}' to be of type Array or String not '#{@option_H[ option_H_key ].class}'"
                SE.puts "If more than one is needed, pass them in as an array.  The default is: '-| through '."
                SE.q { 'option_H' }
                raise
            end
        when :date_clump_separators
            if ( @option_H[ option_H_key ].is_a?( Symbol ) ) then
                if ( @option_H[ option_H_key ] == :none ) then
                    @option_H[ :date_clump_separators ] = 255.chr       # Use 255.chr \xFF for none
                else
                    SE.puts "#{SE.lineno}: option_H[ :date_clump_separators ] should be :none, or [xyz] (where xyz = some separators)."
                    SE.q { 'option_H' }
                    raise
                end
            else
                # The value must contain a bracketed run of one or more non-word characters, e.g. '[|/]'.
                # (The previous test, /\[\W\]+/, only accepted a single character between the brackets
                # and so rejected the documented '[xyz]' form, including this class's own default.)
                if ( not ( @option_H[ option_H_key ].length > 1 and @option_H[ option_H_key ] =~ /\[\W+\]/ ) ) then
                    SE.puts "#{SE.lineno}: option_H[ :date_clump_separators ] should be :none, or [xyz] (where xyz = some separators)."
                    SE.q { 'option_H' }
                    raise
                end
            end
        when :pattern_name_RES
            if ( not @option_H[ option_H_key ].is_a?( String ) ) then
                SE.puts "#{SE.lineno}: option_H[ :pattern_name_RES ] should be an String that will convert to a regexp."
                SE.q { 'option_H' }
                raise
            end
        when :default_century
            default_century = @option_H[ option_H_key ]
            # Accept 'NN', or 'NN00' (a four digit value must end in "00").
            if ( not ( default_century.integer? and ( default_century.length == 2 or ( default_century.length == 4 and default_century[ 2..3 ] == "00" )))) then
                SE.puts "#{SE.lineno}: Expected the :default_century to be NN00 (or NN), not '#{default_century}'"
                raise
            end
            @option_H[ option_H_key ] = @option_H[ option_H_key ][0..1]     # Only the two century digits are kept.
        when :yyyy_min_value
            if ( not (@option_H[ option_H_key ].integer? and @option_H[ option_H_key ].length == 4 )) then
                SE.puts "#{SE.lineno}: Expected the :yyyy_min_value to be NNNN, not '#{@option_H[ option_H_key ]}'"
                raise
            end
        when :yyyy_max_value
            if ( not (@option_H[ option_H_key ].integer? and @option_H[ option_H_key ].length == 4 )) then
                SE.puts "#{SE.lineno}: Expected the :yyyy_max_value to be NNNN, not '#{@option_H[ option_H_key ]}'"
                raise
            end
        when :date_string_composition
            if ( not (@option_H[ option_H_key ].is_a?( Symbol ) and @option_H[ option_H_key ].in?( [ :only_dates, :dates_in_text ] ))) then
                SE.puts "#{SE.lineno}: Expected :date_string_composition to be :only_dates or :dates_in_text, not '#{@option_H[ option_H_key ]}'"
                raise
            end
        when :nn_mmm_nn_day_year_order
            if ( not (@option_H[ option_H_key ].is_a?( Symbol ) and @option_H[ option_H_key ].in?( [ :dd_mm_yy, :yy_mm_dd ] ))) then
                SE.puts "#{SE.lineno}: Expected :nn_mmm_nn_day_year_order to be :dd_mm_yy or :yy_mm_dd, not '#{@option_H[ option_H_key ]}'"
                raise
            end
        when :nn_nn_nn_date_order
            if ( not (@option_H[ option_H_key ].is_a?( Symbol ) and @option_H[ option_H_key ].in?( [ :mm_dd_yy, :dd_mm_yy, :yy_mm_dd ] ))) then
                SE.puts "#{SE.lineno}: Expected :nn_nn_nn_date_order to be :mm_dd_yy, :dd_mm_yy, or :yy_mm_dd not '#{@option_H[ option_H_key ]}'"
                raise
            end
        when :sort
            if ( not ( [true, false].include?( @option_H[ option_H_key ] ) ) ) then
                SE.puts "#{SE.lineno}: Expected '#{option_H_key}' to be true or false, not '#{@option_H[ option_H_key ]}'"
                SE.q { 'option_H' }
                raise
            end
        else
            SE.puts "#{SE.lineno}: invalid option_H option: '#{option_H_key}'"
            SE.q { '@option_H' }
            raise
        end
    end

    # Defaults for every option that was not supplied.
    if ( not @option_H.key?( :debug_options ) )
        @option_H[ :debug_options ] = []
    end
    if ( not @option_H.key?( :morality_replace_option ) )
        @option_H[ :morality_replace_option ] = { }
    end
    if ( not @option_H[ :morality_replace_option ].key?( :good ) ) then
        @option_H[ :morality_replace_option ][ :good ] = :remove
    end
    if ( not @option_H[ :morality_replace_option ].key?( :bad ) ) then
        @option_H[ :morality_replace_option ][ :bad ] = :keep
    end
    if ( not @option_H.key?( :thru_date_separators ) ) then
        @option_H[ :thru_date_separators ] = '-| through '
    end
    if ( not @option_H.key?( :date_clump_separators ) ) then
        @option_H[ :date_clump_separators ] = '[|/]| and '
    end
    if ( not @option_H.key?( :pattern_name_RES ) ) then
        @option_H[ :pattern_name_RES ] = '.'
    end
    if ( not @option_H.key?( :default_century ) ) then
        @option_H[ :default_century ] = "19"
    end
    if ( not @option_H.key?( :date_string_composition ) ) then
        @option_H[ :date_string_composition ] = :dates_in_text
    end
    if ( not @option_H.key?( :yyyy_min_value ) ) then
        @option_H[ :yyyy_min_value ] = '1800'
    end
    if ( not @option_H.key?( :yyyy_max_value ) ) then
        @option_H[ :yyyy_max_value ] = '2100'
    end
    if ( not @option_H.key?( :nn_mmm_nn_day_year_order ) ) then
        @option_H[ :nn_mmm_nn_day_year_order ] = :dd_mm_yy
    end
    if ( not @option_H.key?( :nn_nn_nn_date_order ) ) then
        @option_H[ :nn_nn_nn_date_order ] = :mm_dd_yy
    end
    if ( not @option_H.key?( :sort ) ) then
        @option_H[ :sort ] = true
    end

    # Regexp fragments (as strings, hence the _RES suffix) shared by the date patterns below.
#   dash_RES                   = "\\s{0,2}-\\s{0,2}"
    dash_slash_RES             = "\\s{0,2}(?:-|/){1}\\s{0,2}"         # Note the \\ because it's a double quoted string.
    space_dash_RES             = "\\s{0,2}(?:\\s|-){1}\\s{0,2}"
    space_dash_slash_RES       = "\\s{0,2}(?:\\s|-|/){1}\\s{0,2}"
    comma_RES                  = "\\s{0,2},\\s{0,2}"
    space_comma_RES            = "\\s{0,2}(?:\\s|,){1}\\s{0,2}"
    space_RES                  = "\\s{0,3}"
#   soft_month_RES             = "(?:([a-z]{3,11}|[a-z]{3,3}\\.)){1}"  # Vague alpha months
    hard_month_RES             = K.month_RES                           # Specific month names
    month_RES                  = hard_month_RES
    n_nn_RES                   = "(?:\\d{1,2})"                        # For known day positions and numeric month positions
    nn_nnnn_RES                = "(?:\\d{2,4})"                        # For possible year positions
    @thru_date_separator_RES   = "(?:\\s{0,2}(?:#{@option_H[ :thru_date_separators ]})\\s{0,2}){1}"
    @thru_date_begin_delim_RES = "^\\s*"
    @begin_delim_RES           = "(?:\\s|(?:#{@option_H[ :date_clump_separators ]}))*"
    @end_delim_RES             = "\\s*(?:#{@thru_date_separator_RES}|\\W|$){1}"     # The \\W will match any separators

    date_pattern_RES_S = Struct.new( :pattern_name, :pattern_RES )     # The :pattern_name is computed and filled in later.
    #   Literal spaces in the pattern are removed. Spaces are just
    #   for ease of reading.  To match a space, use \\s instead.
    #   And, it HAS to be \\ because it's a double quoted string.
    initial__date_pattern_RES_S__A = []

    #   fmt002__ = possible year
    initial__date_pattern_RES_S__A << date_pattern_RES_S.new( nil,
        "(?<fmt002__yyyy> (?<year_M>#{nn_nnnn_RES}))" )

    #   fmt003__ = Dates in 'mmm dd, yyyy' format with spaces between 'mmm' 'dd'
    initial__date_pattern_RES_S__A << date_pattern_RES_S.new( nil,
        "(?<fmt003__MMM_dd_yyyy> (?<month_M>#{month_RES})#{space_RES} (?<day_M>#{n_nn_RES})#{comma_RES} (?<year_M>#{nn_nnnn_RES}))" )

    #   fmt004__ = Dates in 'mmm yyyy' or 'mmm-yyyy' format
    initial__date_pattern_RES_S__A << date_pattern_RES_S.new( nil,
        "(?<fmt004__MMM_yyyy> (?<month_M>#{month_RES})#{space_dash_RES} (?<year_M>#{nn_nnnn_RES}))" )

    #   fmt005__ = Dates in 'yyyy mmm' or 'yyyy-mmm' format
    initial__date_pattern_RES_S__A << date_pattern_RES_S.new( nil,
        "(?<fmt005__yyyy_MMM> (?<year_M>#{nn_nnnn_RES})#{space_dash_RES} (?<month_M>#{month_RES}))" )

    #   fmt006__ = Dates in 'dd [-/] mmm [-/] yyyy' or ' yyyy [-/] mmm [-/] dd' format
    initial__date_pattern_RES_S__A << date_pattern_RES_S.new( nil,
        "(?<fmt006__nn_MMM_nn> (?:" +
        "  (?: (?<nn_1st_M>#{n_nn_RES})#{space_dash_slash_RES} (?<month_M>#{month_RES})#{space_dash_slash_RES} (?<nn_3rd_M>#{n_nn_RES}))" +
        "| (?: (?<nn_1st_M>#{n_nn_RES})#{space_dash_slash_RES} (?<month_M>#{month_RES})#{space_dash_slash_RES} (?<nn_3rd_M>#{nn_nnnn_RES}))" +
        "| (?: (?<nn_1st_M>#{nn_nnnn_RES})#{space_dash_slash_RES} (?<month_M>#{month_RES})#{space_dash_slash_RES} (?<nn_3rd_M>#{n_nn_RES}))" +
        " ) )" )

    #   fmt008__ = Dates in 'mmm dd - dd, yyyy' format (hybrid double)
    initial__date_pattern_RES_S__A << date_pattern_RES_S.new( nil,
        "(?<fmt008__MMM_dd_dd_yyyy> (?<month_M>#{month_RES})#{space_RES} (?<day_M>#{n_nn_RES})"+
        "#{@thru_date_separator_RES} (?<thru_day_M>#{n_nn_RES})#{space_comma_RES} (?<year_M>#{nn_nnnn_RES}))" )

    #   fmt009__ = Dates in 'mmm dd - mmm dd, yyyy' format (hybrid double)
    initial__date_pattern_RES_S__A << date_pattern_RES_S.new( nil,
        "(?<fmt009__MMM_dd_MMM_dd_yyyy> (?<month_M>#{month_RES})#{space_RES} (?<day_M>#{n_nn_RES})"+
        "#{@thru_date_separator_RES} (?<thru_month_M>#{month_RES})#{space_comma_RES} (?<thru_day_M>#{n_nn_RES})#{comma_RES}(?<year_M>#{nn_nnnn_RES}))" )

    #   fmt010__ = Dates in 'mmm-mmm yy[yy]' format (hybrid double)  Note there's NO COMMA after the month
    initial__date_pattern_RES_S__A << date_pattern_RES_S.new( nil,
        "(?<fmt010__MMM_MMM_yyyy> (?<month_M>#{month_RES})#{space_RES}"+
        "#{@thru_date_separator_RES} (?<thru_month_M>#{month_RES})#{space_RES} (?<year_M>#{nn_nnnn_RES}))" )

    #   fmt013__ = All numeric dates 'nn [-/] nn [-/] nn' format, the 1st and 3rd positions could be 1 to 4 digits (days or years)
    initial__date_pattern_RES_S__A << date_pattern_RES_S.new( nil,
        "(?<fmt013__nn_nn_nn> (?:" +
        "  (?: (?<nn_1st_M>#{n_nn_RES})#{dash_slash_RES} (?<nn_2nd_M>#{n_nn_RES})#{dash_slash_RES} (?<nn_3rd_M>#{n_nn_RES}))" +
        "| (?: (?<nn_1st_M>#{n_nn_RES})#{dash_slash_RES} (?<nn_2nd_M>#{n_nn_RES})#{dash_slash_RES} (?<nn_3rd_M>#{nn_nnnn_RES}))" +
        "| (?: (?<nn_1st_M>#{nn_nnnn_RES})#{dash_slash_RES} (?<nn_2nd_M>#{n_nn_RES})#{dash_slash_RES} (?<nn_3rd_M>#{n_nn_RES}))" +
        " ) )" )

    #   Set the pattern_name (taken from the first named group) and strip the literal spaces.
    initial__date_pattern_RES_S__A.each_index do | idx |
        stringer = initial__date_pattern_RES_S__A[ idx ].pattern_RES.gsub( / /,'' )
        pattern_name = stringer[ stringer.index( '<' ) + 1 .. stringer.index( '>' ) - 1 ]
        if ( not pattern_name.match?( /^fmt\d{3}__/ ) ) then
            SE.puts "#{SE.lineno}: I shouldn't be here: pattern_name doesn't start with /fmtNNN__'#{pattern_name}'"
            raise
        end
        initial__date_pattern_RES_S__A[ idx ].pattern_RES = stringer    # get rid of the literal spaces.
        initial__date_pattern_RES_S__A[ idx ].pattern_name = pattern_name
    end

    #   Load date patterns to use (those whose name matches :pattern_name_RES)
    @date_pattern_RES_S__A = [ ]
    initial__date_pattern_RES_S__A.each_index do | idx |
        pattern_name = initial__date_pattern_RES_S__A[ idx ].pattern_name
        if ( pattern_name.match?( /#{@option_H[ :pattern_name_RES ]}/ )) then
            @date_pattern_RES_S__A.push( initial__date_pattern_RES_S__A[ idx ] )
        end
    end
    if ( @date_pattern_RES_S__A.length == 0 ) then
        SE.puts "#{SE.lineno}: No patterns selected based on RE: #{@option_H[ :pattern_name_RES ]}"
        raise
    end

    #   Check for duplicate pattern names, and zero the per-pattern usage counters.
    @pattern_cnt_H = {}
    @date_pattern_RES_S__A.each_index do | idx |
        pattern_name = @date_pattern_RES_S__A[ idx ].pattern_name
        if ( @pattern_cnt_H.key?( pattern_name ) ) then
            SE.puts "#{SE.lineno}: I shouldn't be here: duplicate pattern_name '#{pattern_name}'"
            raise
        end
        @pattern_cnt_H[ pattern_name ] = 0
    end

    #   One candidate match of one pattern.
    @possible_date_C = Struct.new( :pattern_name,
                                   :match_O,
                                 )

    #   One from[-thru] date found in the input, together with its good/bad judgement.
    @date_clump_C = Struct.new( :full_match_string,
                                :replace_uid,
                                :beginning_offset,
                                :date_match_S__A,
                                :morality,
                                :error_msg,
                                keyword_init: true
                              ) do
        #   Append input_error_msg to error_msg (optionally printing it) and, unless judgement is nil,
        #   record the judgement.  A :bad morality is never changed to anything else.
        def judge_date( judgement, input_error_msg, print = true )
            SE.puts input_error_msg if ( print )
            SE.puts "" if ( print )
            self.error_msg = "" if ( error_msg == nil )
            self.error_msg += " " if ( error_msg != "")
            self.error_msg += input_error_msg
            return if ( judgement == nil )
            if ( morality == nil ) then
                self.morality = judgement
                return
            end
            if ( morality == :bad ) then
                return if ( judgement == :bad )
                SE.puts "#{SE.lineno}: Date morality was already bad, not changed to #{judgement}"
                return
            end
            SE.puts "#{SE.lineno}: Date morality was already #{morality}, changed to #{judgement}"
            self.morality = judgement
            return
        end
        #   The from date entry (element 0).
        def from
            return date_match_S__A[ 0 ]
        end
        #   The thru date entry (element 1), nil when there is none.
        def thru
            return date_match_S__A[ 1 ]
        end
        #   The from date's as_date value; raises when there is no from entry.
        def from_date
            date_match_S = from
            if ( date_match_S == nil ) then
                SE.puts "#{SE.lineno}: I shouldn't be here: date_clump_S without a from date"
                SE.q { :self }
                raise
            else
                return date_match_S.as_date
            end
        end
        #   The thru date's as_date value, or "" when there is no thru entry.
        def thru_date
            date_match_S = thru
            if ( date_match_S == nil ) then
                return ""
            else
                return date_match_S.as_date
            end
        end
    end

    #   One matched date.  match_O is normally a MatchData; do_find also creates "generated" thru dates
    #   whose match_O is a plain String, so the piece accessors below must cope with both.
    @date_match_C = Struct.new( :match_O,
                                :pattern_name,
                                :ymd_S,
                                :strptime_O,
                                :as_date,
                              ) do
        #   The begin delimiter + date + end delimiter text (or the String itself for a generated date).
        def all_pieces
            return match_O.to_s if ( not match_O.respond_to?( :named_captures ) )
            return match_O.named_captures[ 'begin_M' ] +
                   match_O.named_captures[ 'date_M' ] +
                   match_O.named_captures[ 'end_M' ]
        end
        #   piece( 0 ) = begin delimiter, piece( 1 ) = date, piece( 2 ) = end delimiter;
        #   a Range returns the selected pieces joined together.
        def piece( num )
            if ( match_O.respond_to?( :named_captures ) ) then
                piece_A = [ match_O.named_captures[ 'begin_M' ],
                            match_O.named_captures[ 'date_M' ],
                            match_O.named_captures[ 'end_M' ],
                          ]
            else
                piece_A = [ "", match_O.to_s, "" ]      # Generated date: there are no delimiters.
            end
            if ( num.is_a?( Integer )) then
                return piece_A[ num ]
            end
            if ( num.is_a?( Range )) then
                return piece_A[ num ].join('')
            end
            raise "#{SE.lineno}: I shouldn't be here: Was expect a number or range."
        end
        alias_method :pieces, :piece
    end

    @ymd_C = Struct.new( :year, :month, :day )
    return
end
# Read-only access to the validated options, the selected date patterns, and the per-pattern match counters.
attr_reader :option_H, :date_pattern_RES_S__A, :pattern_cnt_H
# Scan input_string (from initial_offset) with every selected date pattern and return a tree of
# candidates: an array of [ possible_date_S, children ] pairs.  When a candidate's end delimiter is a
# thru-date separator, children is the tree built from the text right after that candidate (offsets in
# the children are relative to that remaining text); otherwise children is [ ].
#
# looking_for_a_thru_date anchors the candidate at the start of the text instead of allowing the
# date-clump separators in front of it.  level is the recursion depth; more than 10 levels raises.
def get_tree_of__possible_date_S__A_A( input_string, initial_offset, looking_for_a_thru_date = false, level = 0 )
    # The name of this local is passed to SE.q as a string below, so it is kept as-is.
    tree_of__possible_date_S__A_A = [ ]
    if ( level > 10 ) then
        SE.puts "In to deep"
        SE.q { 'tree_of__possible_date_S__A_A' }
        raise
    end

    @date_pattern_RES_S__A.each do | date_pattern_RES_S |
        regex = if ( looking_for_a_thru_date ) then
                    %r{(?<begin_M>#{@thru_date_begin_delim_RES})(?<date_M>#{date_pattern_RES_S.pattern_RES})(?<end_M>#{@end_delim_RES})}xi
                else
                    # The lookahead '(?=([^\>]|$))' keeps the pattern from seeing the NNNN> of the date-clump
                    # literals and turning the NNNN into a year (with the '>' acting as a separator matched on \W).
                    %r{(?<begin_M>#{@begin_delim_RES})(?<date_M>#{date_pattern_RES_S.pattern_RES})(?=([^\>]|$))(?<end_M>#{@end_delim_RES})}xi
                end

        scan_begin_offset = initial_offset
        while ( scan_begin_offset < input_string.maxoffset )
            match_O = input_string.match( regex, scan_begin_offset )
            break if ( match_O.nil? )

            end_piece       = match_O[ :end_M ]
            consumed_length = ( match_O[ :begin_M ] + match_O[ :date_M ] + end_piece ).length
            resume_offset   = match_O.begin( :begin_M ) + consumed_length

            # A thru separator after the date means the remaining text may hold the thru date.
            children = if ( end_piece.match?( /#{@thru_date_separator_RES}/ix ) ) then
                           get_tree_of__possible_date_S__A_A( input_string[ resume_offset .. -1 ], 0, true, level + 1 )
                       else
                           [ ]
                       end
            tree_of__possible_date_S__A_A << [ @possible_date_C.new( date_pattern_RES_S.pattern_name, match_O ), children ]

            scan_begin_offset = resume_offset
        end
    end
    return tree_of__possible_date_S__A_A
end
# Flatten a candidate tree (array of [ node, children ] pairs) into every root-to-leaf path.
# Each path is appended, as a new array, to combinations_of__possible_date_S__A_A, which is also
# the return value.  predecessors_A holds the nodes above the current level and is never modified.
def get_combinations_of__possible_date_S__A_A( tree_of__possible_date_S__A_A, combinations_of__possible_date_S__A_A = [], predecessors_A = [] )
    tree_of__possible_date_S__A_A.each do | branch_A |
        path_A    = predecessors_A + [ branch_A[ 0 ] ]
        subtree_A = branch_A[ 1 ]
        if ( subtree_A.length > 0 ) then
            get_combinations_of__possible_date_S__A_A( subtree_A, combinations_of__possible_date_S__A_A, path_A )
        else
            combinations_of__possible_date_S__A_A << path_A      # Leaf: the path is complete.
        end
    end
    return combinations_of__possible_date_S__A_A
end
# Find every candidate date (with its optional thru dates) in input_string and return the "longest"
# combination as an array of date-match structs: [ from ] or [ from, thru, ... ].
# Returns [ ] when nothing matches.
#
# Combinations are ranked by: most non-whitespace characters matched, then fewest dates in the
# combination, then earliest starting offset.
#
# The three long local names are passed to SE.q as strings, so they are kept as-is.
def get_the_longest_date( input_string, initial_offset = 0 )
    debug_options = @option_H[ :debug_options ]

    tree_of__possible_date_S__A_A = get_tree_of__possible_date_S__A_A( input_string, initial_offset )
    if ( debug_options.include?( :print_date_tree )) then
        SE.puts ""
        SE.q { 'tree_of__possible_date_S__A_A' }
    end
    return [ ] if ( tree_of__possible_date_S__A_A.empty? )

    combinations_of__possible_date_S__A_A = get_combinations_of__possible_date_S__A_A( tree_of__possible_date_S__A_A )
    if ( debug_options.include?( :print_unsorted_combinations )) then
        SE.puts ""
        SE.q { 'combinations_of__possible_date_S__A_A' }
    end

    sorted_combinations_of__possible_date_S__A_A = combinations_of__possible_date_S__A_A.sort_by do | combination_A |
        matched_char_cnt = combination_A.sum { | possible_date_S | possible_date_S.match_O[0].gsub( /\s/,"" ).length }
        first_offset     = combination_A[ 0 ].match_O.offset( :begin_M )[0]
        [ -matched_char_cnt, combination_A.length, first_offset ]
    end
    if ( debug_options.include?( :print_sorted_combinations )) then
        SE.puts ""
        SE.q { 'sorted_combinations_of__possible_date_S__A_A' }
    end

    #   Only the best combination (element 0 after sorting) is returned.
    return sorted_combinations_of__possible_date_S__A_A[ 0 ].map do | possible_date_S |
        @date_match_C.new( possible_date_S.match_O, possible_date_S.pattern_name )
    end
end
# Find the dates in param_input_string, judge each one :good or :bad, and return the string with the
# dates kept or removed according to option :morality_replace_option.
#
# Side effects: sets @good__date_clump_S__A and @bad__date_clump_S__A, increments @pattern_cnt_H, and
# prints diagnostics through SE.
#
# Steps:
#   1. Repeatedly take the longest date, replace it in the working string with a
#      '<DATE_CLUMP_#:NNNNNNNNNN>' placeholder, and split it into year/month/day strings.
#   2. Validate each date, convert it with Date::strptime, and judge the clump.
#   3. Put the kept dates back / delete the removed ones, split the clumps into good and bad,
#      optionally sort the good ones, and apply the :date_string_composition checks.
#
# With :only_dates, if any text other than dates is left over, every good date is moved to the bad
# array and the original string is returned unchanged.
#
# Raises (bare RuntimeError) on internal inconsistencies ("I shouldn't be here") or more than 100 passes.
#
# NOTE(review): SE.q is handed variable names as strings (presumably evaluated in the block's
# binding), so 'process_input_string' and 'date_clump_S' must keep their names -- confirm.
def do_find( param_input_string )
    date_clump_S__A = [ ]
    process_input_string = "" + param_input_string          # Make a new string, not a pointer.
    loop_detector = 0
    loop do
        if ( ( loop_detector += 1 ) > 100) then
            SE.puts "#{SE.lineno}: I shouldn't be here: loop_detector > 100"
            raise
        end
        if ( @option_H[ :debug_options ].include?( :print_process_input_string )) then
            SE.puts ""
            SE.q { 'process_input_string' }
        end

        #   date_match_S__A is the from date [element 0] and (optional) thru date [element 1].
        date_match_S__A = get_the_longest_date( process_input_string )
        break if ( date_match_S__A.empty? )

        date_clump_S = @date_clump_C.new( full_match_string: "",
                                          date_match_S__A: date_match_S__A,
                                        )
        date_clump_S__A << date_clump_S

        #   Walk a snapshot of the matched dates: the hybrid-pattern branch below appends a generated
        #   thru date to date_clump_S.date_match_S__A, and that entry (whose match_O is a String, not a
        #   MatchData) must neither be visited here nor shift which entry counts as "the last one".
        matched__date_match_S__A = date_clump_S.date_match_S__A.dup
        last_matched_I = matched__date_match_S__A.length - 1
        matched__date_match_S__A.each_with_index do | date_match_S, date_match_I|
            if ( date_match_I == 0 ) then
                date_clump_S.replace_uid = "<" + @uid_string + "%010d" % (date_clump_S__A.length) + ">"
                date_clump_S.beginning_offset = date_match_S.match_O.offset( :begin_M )[0]
            end
            if ( date_match_I == last_matched_I ) then      # If we're on the last one...
                stringer = date_match_S.piece( 0..1 )       # Drop the ending delimiter from the match
            else
                stringer = date_match_S.all_pieces
            end
            date_clump_S.full_match_string += stringer
            @pattern_cnt_H[ date_match_S.pattern_name ] += 1

            date_match_S.ymd_S = @ymd_C.new( )
            if ( date_match_S.pattern_name.match?( /__nn_nn_nn/ ) ) then
                if ( date_match_S.match_O.named_captures[ 'nn_1st_M' ].length == 4 ) then
                    #   A 4 digit first number is always taken as the year.
                    date_match_S.ymd_S.year  = date_match_S.match_O.named_captures[ 'nn_1st_M' ]
                    date_match_S.ymd_S.month = date_match_S.match_O.named_captures[ 'nn_2nd_M' ]
                    date_match_S.ymd_S.day   = date_match_S.match_O.named_captures[ 'nn_3rd_M' ]
                else
                    case @option_H[ :nn_nn_nn_date_order ]
                    when :mm_dd_yy
                        date_match_S.ymd_S.month = date_match_S.match_O.named_captures[ 'nn_1st_M' ]
                        date_match_S.ymd_S.day   = date_match_S.match_O.named_captures[ 'nn_2nd_M' ]
                        date_match_S.ymd_S.year  = date_match_S.match_O.named_captures[ 'nn_3rd_M' ]
                    when :dd_mm_yy
                        date_match_S.ymd_S.day   = date_match_S.match_O.named_captures[ 'nn_1st_M' ]
                        date_match_S.ymd_S.month = date_match_S.match_O.named_captures[ 'nn_2nd_M' ]
                        date_match_S.ymd_S.year  = date_match_S.match_O.named_captures[ 'nn_3rd_M' ]
                    when :yy_mm_dd
                        date_match_S.ymd_S.year  = date_match_S.match_O.named_captures[ 'nn_1st_M' ]
                        date_match_S.ymd_S.month = date_match_S.match_O.named_captures[ 'nn_2nd_M' ]
                        date_match_S.ymd_S.day   = date_match_S.match_O.named_captures[ 'nn_3rd_M' ]
                    else
                        SE.puts "#{SE.lineno}: I shouldn't be here: #{date_match_S.pattern_name}: "+
                                "'#{date_match_S.all_pieces}' > "+
                                "invalid :nn_nn_nn_date_order value '#{@option_H[ :nn_nn_nn_date_order ]}'"
                        raise
                    end
                end
            elsif ( date_match_S.pattern_name.match?( /__nn_MMM_nn/ ) ) then
                case @option_H[ :nn_mmm_nn_day_year_order ]
                when :yy_mm_dd
                    date_match_S.ymd_S.year  = date_match_S.match_O.named_captures[ 'nn_1st_M' ]
                    date_match_S.ymd_S.month = date_match_S.match_O.named_captures[ 'month_M' ]
                    date_match_S.ymd_S.day   = date_match_S.match_O.named_captures[ 'nn_3rd_M' ]
                when :dd_mm_yy
                    date_match_S.ymd_S.day   = date_match_S.match_O.named_captures[ 'nn_1st_M' ]
                    date_match_S.ymd_S.month = date_match_S.match_O.named_captures[ 'month_M' ]
                    date_match_S.ymd_S.year  = date_match_S.match_O.named_captures[ 'nn_3rd_M' ]
                else
                    SE.puts "#{SE.lineno}: I shouldn't be here: #{date_match_S.pattern_name}: "+
                            "'#{date_match_S.all_pieces}' > "+
                            "invalid :nn_mmm_nn_day_year_order value '#{@option_H[ :nn_mmm_nn_day_year_order ]}'"
                    raise
                end
            elsif ( date_match_S.pattern_name.match?( /__(MMM_dd_dd_yy|MMM_dd_MMM_dd_yy|MMM_MMM_yy)/ )) then
                #   Hybrid patterns hold both the from and the thru date in one match, so the thru date
                #   is generated here from the same match and appended to the clump.
                date_match_S.ymd_S.year  = date_match_S.match_O.named_captures[ 'year_M' ]
                date_match_S.ymd_S.month = date_match_S.match_O.named_captures[ 'month_M' ]
                date_match_S.ymd_S.day   = date_match_S.match_O.named_captures[ 'day_M' ]

                generated_thru_date_match_S = @date_match_C.new( "GENERATED_THRU_DATE" )
                generated_thru_date_match_S.pattern_name = date_match_S.pattern_name
                generated_thru_date_match_S.ymd_S = @ymd_C.new( )
                generated_thru_date_match_S.ymd_S.year = date_match_S.match_O.named_captures[ 'year_M' ]
                if ( date_match_S.match_O.named_captures.key?( 'thru_month_M' )) then
                    generated_thru_date_match_S.ymd_S.month = date_match_S.match_O.named_captures[ 'thru_month_M' ]
                else
                    generated_thru_date_match_S.ymd_S.month = date_match_S.match_O.named_captures[ 'month_M' ]
                end
                if ( date_match_S.match_O.named_captures.key?( 'thru_day_M' )) then
                    generated_thru_date_match_S.ymd_S.day = date_match_S.match_O.named_captures[ 'thru_day_M' ]
                else
                    generated_thru_date_match_S.ymd_S.day = date_match_S.match_O.named_captures[ 'day_M' ]
                end
                date_clump_S.date_match_S__A << generated_thru_date_match_S
            else
                date_match_S.ymd_S.year  = date_match_S.match_O.named_captures[ 'year_M' ]
                date_match_S.ymd_S.month = date_match_S.match_O.named_captures[ 'month_M' ]
                date_match_S.ymd_S.day   = date_match_S.match_O.named_captures[ 'day_M' ]
            end
            #   Stop unless this date ended with a thru separator (i.e. a thru date follows).
            break if (date_match_S.piece( 2 ) !~ /#{@thru_date_separator_RES}/ix )
        end
        #   Swap the matched text for the placeholder so the next pass doesn't find it again.
        process_input_string[ date_clump_S.beginning_offset, date_clump_S.full_match_string.length ] = date_clump_S.replace_uid
    end

    date_clump_S__A.each do | date_clump_S |
        date_clump_S.date_match_S__A.each_with_index do | date_match_S, date_match_I |
            year  = date_match_S.ymd_S.year
            month = date_match_S.ymd_S.month
            day   = date_match_S.ymd_S.day
            if ( year == nil or ( month == nil and day )) then
                SE.puts "#{SE.lineno}: I shouldn't be here: #{date_match_S.pattern_name}, idx=#{date_match_I}: "+
                        "'#{date_match_S.all_pieces}' -> "+
                        "'#{date_match_S.piece( 1 )}' year == nil or ( month == nil and day)!"
                SE.q { 'date_clump_S' }
                raise
            end

            if ( day and day.integer? and day.length == 4 and year.length.between?( 1, 2 ) ) then
                stringer = "#{SE.lineno}: Swapped day and year: #{date_match_S.pattern_name}, idx=#{date_match_I}: "+
                           "'#{date_match_S.piece( 1 )}' -> "+
                           "#{year+' '+month+' '+day}"
                date_clump_S.judge_date( nil, stringer )
                year, day = day, year
                date_match_S.ymd_S.year = year
                date_match_S.ymd_S.day  = day
            end

            if ( year.length == 1 or year.length == 3 ) then
                stringer = "#{SE.lineno}: bad date: #{date_match_S.pattern_name}, idx=#{date_match_I}: "+
                           "'#{date_match_S.all_pieces}' -> "+
                           "'#{date_match_S.piece( 1 )}' bad year."
                date_clump_S.judge_date( :bad, stringer )
                next
            end
            if ( month and month.integer? and month.length == 3 ) then
                stringer = "#{SE.lineno}: bad date: #{date_match_S.pattern_name}, idx=#{date_match_I}: "+
                           "'#{date_match_S.all_pieces}' -> "+
                           "'#{date_match_S.piece( 1 )}' bad month."
                date_clump_S.judge_date( :bad, stringer )
                next
            end
            if ( year.length == 4 )
                #   Both sides are 4 character strings, so these are string comparisons.
                if ( year < @option_H[ :yyyy_min_value] ) then
                    stringer = "#{SE.lineno}: Date dropped: #{date_match_S.pattern_name}, idx=#{date_match_I}: "+
                               "'#{date_match_S.all_pieces}' -> "+
                               "'#{date_match_S.piece( 1 )}' year < min value #{@option_H[ :yyyy_min_value]}"
                    date_clump_S.judge_date( :bad, stringer )
                    next
                end
                if ( year > @option_H[ :yyyy_max_value] ) then
                    stringer = "#{SE.lineno}: Date dropped: #{date_match_S.pattern_name}, idx=#{date_match_I}: "+
                               "'#{date_match_S.all_pieces}' -> "+
                               "'#{date_match_S.piece( 1 )}' year > max value #{@option_H[ :yyyy_max_value]}"
                    date_clump_S.judge_date( :bad, stringer )
                    next
                end
            end
            if ( day and not day.integer? ) then
                stringer = "#{SE.lineno}: bad date: #{date_match_S.pattern_name}, idx=#{date_match_I}: "+
                           "'#{date_match_S.all_pieces}' -> "+
                           "'#{date_match_S.piece( 1 )}' day not numeric: '#{day}'"
                date_clump_S.judge_date( :bad, stringer )
                next
            end
            if ( not year.integer? ) then
                stringer = "#{SE.lineno}: bad date: #{date_match_S.pattern_name}, idx=#{date_match_I}: "+
                           "'#{date_match_S.all_pieces}' -> "+
                           "'#{date_match_S.piece( 1 )}' year not numeric: '#{year}'"
                date_clump_S.judge_date( :bad, stringer )
                next
            end
            if ( year.length == 2 ) then
                if ( date_match_I == 0 ) then
                    year = @option_H[ :default_century ] + year             # These are strings
                else
                    if ( date_clump_S.from.strptime_O ) then
                        year = date_clump_S.from_date[ 0 .. 1] + year       # Take the century from the converted from_year, which is already in YYYY format
                    else
                        year = @option_H[ :default_century ] + year
                    end
                end
            end
            if ( month and not month.integer? ) then
#               month_match_O = month.match( /^(?<month_M>#{K.month_RES})/ )    # Only need for 'soft months' , which isn't programmed
#               if ( not month_match_O == nil ) then
#                   month_named_captures = month_match_O.named_captures
#                   month = month_named_captures[ 'month_M' ]
#               end
                month.sub!( /\.$/, "" )     # Take the period off the months (eg Feb.)
            end

            #   Build the string handed to strptime; a missing month/day defaults to the start of the
            #   period for a from date and to the end of the period for a thru date.
            testdate = year
            if ( date_match_I == 0 )
                testdate += (month) ? " #{month}" : " Jan"
                testdate += (day)   ? " #{day}"   : " 01"
            else
                testdate += (month) ? " #{month}" : " Dec"
                testdate += (day)   ? " #{day}"   : " 01"       # This will be set to the end-of-month below
            end
            if ( month and month.integer? ) then
                strptime_fmt = '%Y %m %d'
            else
                strptime_fmt = '%Y %b %d'
            end
            begin
                date_match_S.strptime_O = Date::strptime( testdate, strptime_fmt )
            rescue
                stringer = "#{SE.lineno}: bad date: #{date_match_S.pattern_name}, idx=#{date_match_I}: "+
                           "'#{date_match_S.all_pieces}' -> "+
                           "'#{date_match_S.piece( 1 )}' -> "+
                           "'#{testdate}' -> '#{strptime_fmt}' strptime conversion failed"
                date_clump_S.judge_date( :bad, stringer )
                next
            end
            if ( date_match_S.strptime_O.year < 0 ) then
                stringer = "#{SE.lineno}: bad date: #{date_match_S.pattern_name}, idx=#{date_match_I}: "+
                           "'#{date_match_S.all_pieces}' -> "+
                           "'#{date_match_S.piece( 1 )}' -> '#{date_match_S.strptime_O}' negative year"
                date_clump_S.judge_date( :bad, stringer )
                next
            end
            if ( day == nil and date_match_S.strptime_O.day != 1 ) then
                SE.puts "#{SE.lineno}: I shouldn't be here: #{date_match_S.pattern_name}, idx=#{date_match_I}: "+
                        "'#{date_match_S.all_pieces}' -> "+
                        "'#{date_match_S.piece( 1 )}' -> '#{date_match_S.strptime_O} day != 1"
                raise
            end
            if ( day == nil and date_match_I > 0 ) then
                date_match_S.strptime_O = date_match_S.strptime_O.last_day_of_month
            end
            #   as_date only carries the precision that was actually present: YYYY[-MM[-DD]]
            date_match_S.as_date  = date_match_S.strptime_O.strftime( '%Y' )
            date_match_S.as_date += date_match_S.strptime_O.strftime( '-%m' ) if ( month )
            date_match_S.as_date += date_match_S.strptime_O.strftime( '-%d' ) if ( day )
        end
        next if ( date_clump_S.morality == :bad )

        if ( date_clump_S.date_match_S__A.length > 2 ) then
            stringer = "#{SE.lineno}: #{date_clump_S.date_match_S__A.length} run-on thru dates: "
            date_clump_S.date_match_S__A.each do | date_match_S |
                stringer += "'#{date_match_S.all_pieces}' "
            end
            date_clump_S.judge_date( :bad, stringer )
            next
        end

        if ( date_clump_S.date_match_S__A.length == 1 and date_clump_S.from.ymd_S.year.length == 2 and date_clump_S.from.ymd_S.month == nil and date_clump_S.from.ymd_S.day == nil ) then
            stringer = "#{SE.lineno}: probable bad date: #{date_clump_S.from.pattern_name}: "+
                       "'#{date_clump_S.from.all_pieces}' -> "+
                       "'#{date_clump_S.from.piece( 1 )}' isolated 2 digit number."
            date_clump_S.judge_date( :bad, stringer )
            next
        end

        if ( date_clump_S.date_match_S__A.length == 2 and date_clump_S.from.strptime_O and date_clump_S.thru.strptime_O ) then
            if (date_clump_S.from.strptime_O > date_clump_S.thru.strptime_O ) then
                stringer = "#{SE.lineno}: From date '#{date_clump_S.from_date}' > Thru date '#{date_clump_S.thru_date}'"
                date_clump_S.judge_date( :bad, stringer )
                next
            end
        end
        if ( @option_H[ :debug_options ].include?( :print_good_dates ) ) then
            #   Report the clump's from date (the per-date loop variable is out of scope here).
            stringer = "#{SE.lineno}: good date: #{date_clump_S.from.pattern_name}: '#{date_clump_S.from.all_pieces}'"
            SE.puts stringer
        end
        date_clump_S.morality = :good
        date_clump_S.error_msg = ""
    end

    #   Resolve the placeholders: kept dates get their text back, removed dates disappear.  The second
    #   string always has every date removed and is used for the composition checks below.
    process_input_string_with_all_dates_removed = process_input_string + ""
    date_clump_S__A.each do | date_clump_S |
        replace_option = @option_H[ :morality_replace_option ][ date_clump_S.morality ]
        case replace_option
        when :keep
            begin
                process_input_string[ date_clump_S.replace_uid ] = date_clump_S.full_match_string
                process_input_string_with_all_dates_removed[ date_clump_S.replace_uid ] = ""
            rescue
                SE.puts "#{SE.lineno}: replace_uid failed"
                SE.puts "process_input_string = #{process_input_string}"
                SE.q { 'date_clump_S' }
                raise
            end
        when :remove
            begin
                process_input_string[ date_clump_S.replace_uid ] = ""
                process_input_string_with_all_dates_removed[ date_clump_S.replace_uid ] = ""
            rescue
                SE.puts "#{SE.lineno}: replace_uid failed"
                SE.puts "process_input_string = #{process_input_string}"
                SE.q { 'date_clump_S' }
                raise
            end
        else
            SE.puts "#{SE.lineno}: I shouldn't be here, unknown replace_option for morality '#{date_clump_S.morality}' -> "+
                    "'#{@option_H[ :morality_replace_option ][ date_clump_S.morality ]}'"
            SE.q { 'date_clump_S' }
            raise
        end
    end

    @good__date_clump_S__A = [ ]
    @bad__date_clump_S__A = [ ]
    date_clump_S__A.each do | date_clump_S |
        case date_clump_S.morality
        when :good
            @good__date_clump_S__A << date_clump_S
        when :bad
            @bad__date_clump_S__A << date_clump_S
        else
            SE.puts "#{SE.lineno}: I shouldn't be here: amoral date: '#{date_clump_S.morality}', #{date_clump_S}"
            raise
        end
    end

    if ( @option_H[ :sort ] ) then
        @good__date_clump_S__A = @good__date_clump_S__A.sort_by { | date_clump_S | [ date_clump_S.from_date ] }
        prev_date=''
        @good__date_clump_S__A.each_with_index do | date_clump_S, idx |
            if ( date_clump_S.from_date < prev_date ) then
                SE.puts "#{SE.lineno}: Warning: Dates overlap! good from-date '#{date_clump_S.from_date} at element #{idx} "+
                        "< previous date #{prev_date}, there may be others."
                SE.puts ""
                break
            end
            prev_date = (date_clump_S.thru_date == '') ? date_clump_S.from_date : date_clump_S.thru_date
        end
    end

    case @option_H[ :date_string_composition ]
    when :dates_in_text
        if (process_input_string_with_all_dates_removed =~ K.month_RE ) then
            SE.puts "#{SE.lineno}: Warning possible ummatched date '#{$~}' in '#{process_input_string_with_all_dates_removed}'"
            SE.puts ""
        end
    when :only_dates
        if (process_input_string_with_all_dates_removed !~ /^\s*$/ ) then
            SE.puts "#{SE.lineno}: Unconverted dates in: '#{param_input_string}'"
            SE.puts "#{SE.lineno}: Extra text:           '#{process_input_string_with_all_dates_removed}'" if ( param_input_string != process_input_string_with_all_dates_removed )
            if ( @good__date_clump_S__A.length > 0 ) then
                stringer = @good__date_clump_S__A.map do | date_clump_S |
                    date_clump_S.date_match_S__A.map do | date_match_S |
                        date_match_S.as_date
                    end
                end.join( "','")
                SE.puts "#{SE.lineno}: Good dates: '#{stringer}' moved to bad-dates array after row #{@bad__date_clump_S__A.length}"
                @bad__date_clump_S__A += @good__date_clump_S__A
                @good__date_clump_S__A = [ ]
            end
            SE.puts ""
            process_input_string = param_input_string       # Hand back the input untouched.
        end
    end
    return process_input_string
end
# Read-only accessors for the two result arrays built by the date-processing
# method that ends just above:
#   good__date_clump_S__A - clumps whose morality is :good; re-ordered by
#                           from_date when @option_H[ :sort ] is truthy.
#   bad__date_clump_S__A  - clumps whose morality is :bad.
# When @option_H[ :date_string_composition ] is :only_dates and non-blank text
# is left over after all dates are removed, every good clump is appended to the
# bad array and the good array is left empty.
# NOTE(review): both arrays are (re)assigned on each run of that method; their
# value before the first run is not visible from here - confirm in initialize.
attr_reader :good__date_clump_S__A, :bad__date_clump_S__A
end