So let's say I have some airline flight data. One field is Origin Airport and another field is Destination Airport. I'd like to group observations by route (a unique Origin to Destination combination). The issue is that for each unique route the corresponding return route would also need to be included. For example, if I had Airport A to Airport B for one bunch of observation, but Airport B to Airport A in the next few observation, I'd like them all to have the same route ID.
I can use SAS, Stata, or R for this. Even Python, if it's easier.
Rep. code below:
df1 <- structure(list(airl = c("US", "US", "US", "US", "US", "US", "US",
"US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US",
"US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US",
"US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US",
"US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US",
"US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US",
"US", "US", "US", "US", "US", "US", "US"), ORIGIN = c("ABE",
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE",
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE",
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE",
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "CLT", "CLT", "CLT",
"CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "CLT",
"CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "PHL", "PHL", "PHL",
"PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PHL",
"PHL", "PHL", "PHL", "PHL", "PHL"), DESTINATION = c("CLT", "CLT",
"CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "CLT",
"CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "PHL", "PHL", "PHL",
"PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PHL",
"PHL", "PHL", "PHL", "PHL", "PHL", "ABE", "ABE", "ABE", "ABE",
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE",
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE",
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE",
"ABE", "ABE", "ABE", "ABE"), miles = c(480, 480, 480, 480, 480,
480, 480, 480, 480, 480, 480, 480, 480, 480, 480, 480, 480, 54,
54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54,
480, 480, 480, 480, 480, 480, 480, 480, 480, 480, 480, 480, 480,
480, 480, 480, 480, 480, 54, 54, 54, 54, 54, 54, 54, 54, 54,
54, 54, 54, 54, 54, 54, 54, 54), orig_area = c(23, 23, 23, 23,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 36, 36,
36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
23), dest_area = c(36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
36, 36, 36, 36, 36, 36, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23), month = c(1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 4, 5, 6, 7, 8, 9, 10,
11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12, 1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
1, 2, 3, 4, 5, 6, 7), freq = c(88, 80, 89, 78, 88, 83, 85, 80,
76, 79, 76, 81, 86, 65, 62, 60, 82, 137, 138, 142, 144, 149,
147, 150, 143, 150, 138, 128, 151, 145, 148, 146, 147, 149, 79,
76, 81, 86, 65, 62, 60, 82, 82, 82, 84, 81, 83, 81, 85, 84, 76,
85, 143, 137, 138, 142, 143, 151, 147, 150, 143, 150, 137, 128,
151, 145, 148, 146, 147), seats = c(8146, 7352, 7599, 6920, 6759,
6060, 6189, 5939, 6137, 6504, 6440, 6804, 6862, 5330, 5242, 5068,
6204, 6460, 6276, 6047, 6095, 6306, 6102, 6265, 7085, 7344, 6809,
6348, 6965, 6626, 6893, 6741, 6765, 6865, 6504, 6440, 6804, 6862,
5330, 5242, 5068, 6204, 6104, 6030, 6278, 6034, 6944, 6816, 6544,
6494, 5872, 6544, 6747, 6460, 6276, 6034, 6058, 6380, 6102, 6278,
7085, 7344, 6759, 6348, 6952, 6613, 6919, 6728, 6765), year = c(2009,
2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009,
2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010,
2010, 2010, 2010, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011,
2009, 2009, 2009, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010,
2010, 2010, 2010, 2010, 2011, 2011, 2011, 2010, 2010, 2010, 2010,
2010, 2010, 2010, 2010, 2010, 2010, 2011, 2011, 2011, 2011, 2011,
2011, 2011)), .Names = c("airl", "ORIGIN", "DESTINATION", "miles",
"orig_area", "dest_area", "month", "freq", "seats", "year"), class = "data.frame", row.names = c(NA,
69L))