Compare commits
917 commits
Author | SHA1 | Date | |
---|---|---|---|
8c293a4684 | |||
0846aed44a | |||
0e23dcb8ad | |||
a01b18a0f2 | |||
63b0607f58 | |||
cf5f7bfd16 | |||
e96b167b71 | |||
105ceaf386 | |||
c9863f094a | |||
804d0eb661 | |||
5fbe64dc71 | |||
8b3b5a73e8 | |||
8e60a12517 | |||
0f94c98733 | |||
7e93101617 | |||
8f17bcf43b | |||
9b23082a78 | |||
d18c30b481 | |||
f4059d5c72 | |||
59c25d21df | |||
579c5ab8eb | |||
468b5a33ae | |||
37dd7f11a8 | |||
74fb9e0ad7 | |||
e2a14b0e14 | |||
2589e29304 | |||
241021fa39 | |||
12311029e4 | |||
77a01e1627 | |||
b64f508025 | |||
8c57dc2283 | |||
4f07e92c5e | |||
f40ac35f4a | |||
bdce6101ae | |||
271da4e6f3 | |||
e0e780a272 | |||
aced164560 | |||
8fffa20795 | |||
3fdaf35306 | |||
9a6e42fb9c | |||
954df88c86 | |||
b7aae727e5 | |||
b8442dbb15 | |||
b6edc36753 | |||
c4bece2f58 | |||
874c7e3117 | |||
2bafb1b99b | |||
0bb94040d6 | |||
13887ca7e1 | |||
7cd1a70f59 | |||
fe9cc7f29f | |||
816b7e2726 | |||
e92929ef07 | |||
c63a8842d9 | |||
7fef403757 | |||
2aea7d0d48 | |||
57ac0130a6 | |||
00c4307694 | |||
da74096cde | |||
b3e4777206 | |||
8d43cdeef7 | |||
db11e57111 | |||
83f45e7f60 | |||
5d3a539eda | |||
2e2dfe671b | |||
603e7de04d | |||
66049a4021 | |||
7cf096012e | |||
af6222e06c | |||
4e131640ad | |||
8c01a9e7a0 | |||
d0da9be376 | |||
e32d322dbd | |||
58e1d1a8f9 | |||
4ba5df6b37 | |||
b4fccd7ef8 | |||
15a9d25a9d | |||
ac91c9089c | |||
3aa740e979 | |||
1bc20f238e | |||
628739d0b8 | |||
7e3b11caf8 | |||
8af00b20bc | |||
76b441cd62 | |||
d4bfe8fa19 | |||
614c19be10 | |||
8cfc314038 | |||
47e49a2e98 | |||
8feb6517f1 | |||
bfd2d31b7b | |||
921b2d0888 | |||
5427ceb29a | |||
ee095d4814 | |||
9c3c5436b5 | |||
82230a97bc | |||
c4f636c388 | |||
002a2dac43 | |||
1dc5ead4f6 | |||
60ce138a52 | |||
0873a4a2b2 | |||
96cd7d7147 | |||
51b09a77ed | |||
3136a6488c | |||
2524070bd0 | |||
0a3b3d7b7c | |||
b30ced9be9 | |||
3278e67197 | |||
45429be27c | |||
d056647a53 | |||
69fa1f3f09 | |||
b438f81a43 | |||
324242a9bc | |||
e18014cc8a | |||
ef7fcce1cc | |||
831f49daa6 | |||
7b7167643f | |||
175513fbb7 | |||
5cb3c2c635 | |||
fecb65c53a | |||
ad12fc1b7a | |||
48c96beba2 | |||
2d2ed58b34 | |||
9ee13aea23 | |||
87104e7e6a | |||
21bf90f521 | |||
42416db8b9 | |||
4143c53ff1 | |||
9ba62f8c97 | |||
1385545e26 | |||
5341d6f12c | |||
49727aea6e | |||
0a586425d0 | |||
54a800d357 | |||
25fdd28037 | |||
f3c7d796aa | |||
77711c243a | |||
3c7f85725e | |||
dfc21295e3 | |||
5300758b3b | |||
cfd4bad1ef | |||
0e90a2e900 | |||
47a8736f77 | |||
df30a3a3ac | |||
dc5a9ef497 | |||
3d0ac9e483 | |||
325883e441 | |||
14e98f014b | |||
b536a486b6 | |||
02b6e66941 | |||
e7629d7004 | |||
0ce2585f7f | |||
b7d21161fb | |||
d4664d784f | |||
3b5f8bca67 | |||
c834314086 | |||
7bb2a9adbb | |||
57e59db458 | |||
e57932aedf | |||
ca33dee265 | |||
9c067ad74f | |||
3906386838 | |||
7fef6c4023 | |||
398f7b293a | |||
cd05bc388e | |||
1dff7500e7 | |||
06816098dc | |||
f4598c4bec | |||
c4a9da06f6 | |||
5c343ef790 | |||
106d7596b1 | |||
7bd957aafa | |||
d4f7deaa68 | |||
2f2b5b749c | |||
95749584ec | |||
0a9ecac410 | |||
e0a36f4eab | |||
35645da241 | |||
5dbb4d00d4 | |||
0767da14c2 | |||
d60b4e7fdd | |||
cd6bcd82ef | |||
4b160c2611 | |||
44e4c16b76 | |||
55c95495b2 | |||
b47b90f233 | |||
9d63125724 | |||
2bbf1b644e | |||
f22a8aec4d | |||
5d76fcd5aa | |||
7eb2ab6d7d | |||
9545407896 | |||
89653c4bad | |||
4fc0d5dc1d | |||
607d963450 | |||
1f1e7dc63d | |||
7ae318fb20 | |||
27ca92ef15 | |||
af3f98f59c | |||
23e20e6ddc | |||
cb3415c62f | |||
5f443fddff | |||
0731de788d | |||
395bf9180a | |||
ef82387f84 | |||
798ed728f5 | |||
8ab13b4480 | |||
7100291ed9 | |||
59e57cee84 | |||
36e32d4bff | |||
febad9c06c | |||
1157c31be1 | |||
86e451d49e | |||
9277903308 | |||
7d4eb47643 | |||
4f876eecbc | |||
7315afeafd | |||
3fd5bad407 | |||
8c59329ffa | |||
2d365b612b | |||
7d4916919d | |||
decb13b5db | |||
efea01e56f | |||
eb8f9d5876 | |||
e4a44f1e25 | |||
ad172841e2 | |||
e068c9ce56 | |||
53d7ce2e5d | |||
9f3dcece4d | |||
2bdeaf2660 | |||
12982c00cd | |||
1abb7768c3 | |||
f49a1d7a2d | |||
a599169399 | |||
e8d767050f | |||
90a2eac90d | |||
a620ae91a1 | |||
919abb09ef | |||
a6940987f4 | |||
12104d54f1 | |||
aede4d559a | |||
f57590cfa0 | |||
2e68850d0f | |||
ac8855bc14 | |||
bfd481739b | |||
81c49de911 | |||
c410682cc8 | |||
1ad2b68e03 | |||
274407537e | |||
d64acc25f5 | |||
dbd0c6cd42 | |||
4917fae797 | |||
5775c0ab9f | |||
484bde9b13 | |||
4e050c50d6 | |||
90b680935e | |||
68e367453b | |||
b921d03705 | |||
2c93537aea | |||
5a3ff887c4 | |||
806bd76f87 | |||
a2aa739c37 | |||
81b7fe853b | |||
5f779c734e | |||
06988c40b3 | |||
160ee372b9 | |||
7645319510 | |||
71f84420cb | |||
6b7e551934 | |||
6e0c642652 | |||
5adb9f9545 | |||
9deef63fdd | |||
85b216551f | |||
0177b434c2 | |||
57b3bb3134 | |||
49d16267a2 | |||
3811ec37fb | |||
8ec45a9302 | |||
ac3a8e913d | |||
850faffc29 | |||
e4fcacfd4f | |||
e564870cd6 | |||
af0a545c16 | |||
a487320e81 | |||
36ff95de6b | |||
5288b79d1b | |||
9f354e9e52 | |||
92dca3bd0e | |||
5333705440 | |||
1530456cf7 | |||
9ccc9e6863 | |||
8718295ee5 | |||
cc80acd6b5 | |||
f0aebdf5f1 | |||
f670b347ae | |||
d0d72c8229 | |||
0eeb4b46dc | |||
2b50ee0724 | |||
dd8d74ee25 | |||
4a86482756 | |||
8925643331 | |||
2dd446a402 | |||
6dd17c8762 | |||
f19171a1b4 | |||
fc279705c1 | |||
b4dd89cddc | |||
17939fe47c | |||
53562f4873 | |||
8c3af7029e | |||
bd802df38c | |||
f05e909008 | |||
4be0f5ec19 | |||
801784c46d | |||
d25f3fe008 | |||
e493ab048a | |||
8104ce3a8d | |||
f716d982b0 | |||
03d0aec4f6 | |||
4d3f0f9862 | |||
c6c6002ab2 | |||
9b23f273fc | |||
eeb2054606 | |||
327cce5581 | |||
2d6e25d1ac | |||
01923fda0e | |||
e004ccd148 | |||
80baab8de7 | |||
668fe80127 | |||
33312687ac | |||
afe3b71f59 | |||
063caf0126 | |||
89e24eca62 | |||
483f179ccc | |||
ee2075697b | |||
aee6f4add9 | |||
940d646d30 | |||
edc2db0ded | |||
56575dc390 | |||
defd6bca77 | |||
afc2a6416b | |||
3040a35306 | |||
87f283cc98 | |||
dffaaff505 | |||
7bca303b1b | |||
dbe8733fd4 | |||
3cdae99490 | |||
44453b1707 | |||
7a1663db51 | |||
1a4ff07f78 | |||
b58eebb51f | |||
77aaee96f3 | |||
900f9a93ee | |||
d6e45de09b | |||
8826fc5aa9 | |||
381e3c29fa | |||
a2f00c7236 | |||
2f8ca766c6 | |||
d03a5e556e | |||
c36c4c0ce2 | |||
7f1c929a08 | |||
d5ef991b3a | |||
aa55a92791 | |||
6efcf1ce7e | |||
390ce57f46 | |||
1319eeb6da | |||
4abd023c10 | |||
8db9d0bcc4 | |||
c4aa617737 | |||
bd34c37052 | |||
06dc5a6146 | |||
edfeb653a4 | |||
469a7783b8 | |||
ffd07f38ba | |||
fdda8f95e6 | |||
528f5c567d | |||
d53b3b7274 | |||
e8998da2f0 | |||
b632fd089b | |||
1a52dfdcbc | |||
8b1a3d9abc | |||
640001a7f5 | |||
1d187fcf65 | |||
31be3a916e | |||
3dacaf0872 | |||
ccafebf5fe | |||
e5be624f1e | |||
71930e06a8 | |||
fc42587a8f | |||
6755d15675 | |||
72238f39ba | |||
7c27b7bf12 | |||
a8bc4f999e | |||
c5c010bce0 | |||
e009fab504 | |||
b4ae513e71 | |||
9fd8b29833 | |||
e500bc4ad4 | |||
6eeadc8821 | |||
4be75fd48a | |||
8ba2d0bb55 | |||
5c8702da44 | |||
8be3efb6e4 | |||
349abbfb44 | |||
19e97174f3 | |||
6dcef83666 | |||
434aeb8feb | |||
a2f010c40d | |||
6839c65bd6 | |||
79fba4ac4a | |||
6caa02adb1 | |||
12a508c898 | |||
bf50618590 | |||
c89de29f72 | |||
9a1e1ebea1 | |||
be00bfb1bb | |||
6b78a23484 | |||
80bb4a8b5e | |||
346df4726d | |||
fef2fc864b | |||
87959028e5 | |||
f47688812d | |||
b74e93d2b7 | |||
0b5f18430f | |||
323b2d2b03 | |||
4c42469c0c | |||
a511c706c7 | |||
1b23e38ce4 | |||
e2582ecb3e | |||
47a4951279 | |||
5aae6b3df8 | |||
527a8af7b7 | |||
4395dd4646 | |||
da72d8ac2d | |||
a104a154fc | |||
da8c64ec51 | |||
6e34493bb1 | |||
cf6905db28 | |||
bfa6e4da5a | |||
f4c1adaa9a | |||
9d6e54148b | |||
827f1ab80e | |||
32b29f5413 | |||
8fb5103d09 | |||
830e4f2830 | |||
3b28ad24b3 | |||
32f72c35ec | |||
122aa2839b | |||
a75cc0dee9 | |||
b9c0edaebc | |||
7c401b1461 | |||
c581bef790 | |||
fa04d61eb8 | |||
7016603763 | |||
71da1556e5 | |||
36886dffe0 | |||
200916a150 | |||
f768a7d61c | |||
aa750f9ab4 | |||
739f97edcc | |||
ef37712115 | |||
17226a4f0b | |||
c7a5ec4376 | |||
aea30d2b44 | |||
8a7d21e159 | |||
4fd903cbe4 | |||
fca3184950 | |||
99fe3312a4 | |||
6f9430fc71 | |||
bc3ebe9580 | |||
6c086e70f7 | |||
ff36aeb85c | |||
c9bd2e76c9 | |||
edbd0e90a4 | |||
a65bbd7fda | |||
7e70175e4c | |||
db8b1c7547 | |||
a2e22e894a | |||
4ab1d6d6e0 | |||
283ad164e5 | |||
f6d89097f8 | |||
fc5f4a0405 | |||
77f9a7d523 | |||
e642ad68d4 | |||
00defe3b87 | |||
9752ef4b2a | |||
66aef3eab9 | |||
2017994e81 | |||
c9cde54a72 | |||
ab96a3ba97 | |||
fba70dcf18 | |||
a8c2136270 | |||
a86a41e6a5 | |||
afa3e2548f | |||
eda12e5274 | |||
711f8b0c76 | |||
14195157de | |||
b352549088 | |||
db0f90b4e3 | |||
fe95394b3b | |||
310cc123df | |||
600a85cbc8 | |||
e6d2980db3 | |||
2c54cd740a | |||
39935c58d9 | |||
9931839d14 | |||
760e59e1f7 | |||
3c6e9f6ccf | |||
e1a4ac063c | |||
7fcbf623a0 | |||
6a20548269 | |||
17499baf61 | |||
e6551bb797 | |||
db46676dec | |||
667aa395e5 | |||
cb41d4749a | |||
3bc10ce1aa | |||
fbf8a2748e | |||
a28c2d3c73 | |||
12c040d85d | |||
f5d11107a7 | |||
8cdf926211 | |||
7438543f49 | |||
ca495a6677 | |||
214c883a10 | |||
386d5ea41c | |||
6767777944 | |||
d960bc0b7b | |||
2eab4052c5 | |||
1c4cfbb580 | |||
bd5afe5333 | |||
aa43ebd1b8 | |||
7a2c90d321 | |||
91553854e6 | |||
304028cf42 | |||
d46c25913a | |||
44ad34901c | |||
c0cef487bd | |||
298509c7fa | |||
92c7f9bcff | |||
b355fb6500 | |||
61489dc73d | |||
71664dc70a | |||
ee9dec16bf | |||
b37ff0714f | |||
aefe8b79b6 | |||
0d407d7a39 | |||
e78ecd5626 | |||
e8abec43a6 | |||
f483f24e15 | |||
5e81160e5f | |||
8eb374eec6 | |||
d8752b15fa | |||
c85ae3fc69 | |||
52e5120110 | |||
9e3e9fa3ef | |||
48dca9e5ee | |||
59ab5d8777 | |||
ab7a0f6a51 | |||
e672e28a12 | |||
4b195f2b53 | |||
62dedb6c95 | |||
5758aad48b | |||
77bdbbac63 | |||
7d71f8ffab | |||
308853d531 | |||
e35dd9e5d0 | |||
bd9f276acc | |||
3da58dbd5d | |||
ef0b2d437f | |||
36291d5ea6 | |||
af95f3ff66 | |||
aa8032e95f | |||
0c960a4d0c | |||
34c8a9a5d0 | |||
37ca413424 | |||
49c82dc12a | |||
9aa1383b43 | |||
97b10ee4e3 | |||
c90d98ac72 | |||
7228bc572c | |||
9bceafc3e9 | |||
942ca2afea | |||
6704cd1dc0 | |||
5fea34ffce | |||
a291104144 | |||
9cb4dd4cf3 | |||
1768096b85 | |||
b255271016 | |||
2384c03170 | |||
500cee4bae | |||
936cbd4747 | |||
9880d46853 | |||
b5b163084f | |||
c2c3d03205 | |||
4b49991cd6 | |||
ab29e17511 | |||
4e35f0db2b | |||
6c98829fd8 | |||
bf078cbaf8 | |||
702295f1ea | |||
dc9d02a28c | |||
f78856315d | |||
1e1dae8eaa | |||
2385867afb | |||
a05fa1a965 | |||
c9c864b71b | |||
a49d87e154 | |||
32c9d6184c | |||
75d74a5362 | |||
ab9dea0347 | |||
7d69c9a7af | |||
a170356738 | |||
f06e8f3ac4 | |||
0fe28deee9 | |||
a32dd6d0fe | |||
7a436d0481 | |||
3d2e11dc1d | |||
44889d5264 | |||
5d86c2d400 | |||
2e879949f5 | |||
265505efc7 | |||
5e6159ade3 | |||
cbcbf400bb | |||
f0d3cfefc0 | |||
8bd4b8b3a9 | |||
8753fa0e45 | |||
49b0fac7c7 | |||
247fe3e6f7 | |||
58150570a3 | |||
443cf1af01 | |||
f2946c0a87 | |||
20f525bd0d | |||
91bebe5f68 | |||
21e8f0f8b9 | |||
56347da07e | |||
3cb51e638b | |||
1cb8240eb6 | |||
771cc711e4 | |||
79105f9f84 | |||
0007912ad5 | |||
b3c51f49ce | |||
d8b741191d | |||
82860a3341 | |||
5b23f6688a | |||
7174121af2 | |||
c13f2806fa | |||
0d78e16b2d | |||
faa3c20713 | |||
72b2e30e90 | |||
3ca2df067f | |||
1215bc69de | |||
ba2ab25c2c | |||
c9d2a23a5f | |||
2b885451e7 | |||
902f796178 | |||
6a1e652628 | |||
03c8d524a4 | |||
540b237da6 | |||
0929469bef | |||
a9028434c4 | |||
f13e029e47 | |||
a54a1c6dfa | |||
423155a846 | |||
ed26907e0d | |||
3e9a846e2e | |||
bc7ccc0964 | |||
29441e7244 | |||
8eddaeaff4 | |||
9941215643 | |||
0d72bf6431 | |||
3a093d0844 | |||
6aab009204 | |||
28b7deb6d3 | |||
4cd25bcad9 | |||
e042d985b5 | |||
06261cc5cd | |||
61ba21639d | |||
7c1b2da05c | |||
9e546a6718 | |||
447855cb74 | |||
b08c31a1db | |||
c2d3cfd50f | |||
1bf1db707c | |||
959b55a939 | |||
a93813ca45 | |||
8e3e5a62d9 | |||
1b191e7a0e | |||
b8faffaf1f | |||
56966ea6b4 | |||
36b6aafbc1 | |||
f9809caa42 | |||
d977595bde | |||
9417e0cc04 | |||
4d91cf7c0f | |||
ba6cf42096 | |||
ae0269e13b | |||
285d422c0e | |||
af348a05dd | |||
2b9dc1b96c | |||
fc6e5872b2 | |||
6657f0803d | |||
3e18997652 | |||
6d6327a396 | |||
f941161014 | |||
312769cb66 | |||
f7989ca518 | |||
d3c8897f6a | |||
945116063f | |||
3703d2b9b9 | |||
50531c7b3e | |||
dd522c18d4 | |||
f8347e2d31 | |||
27a8b497a8 | |||
271c0b989d | |||
92386000d8 | |||
48233fad36 | |||
1046bcdf69 | |||
d78c4ca78e | |||
eca5da7f46 | |||
25f70463f3 | |||
839b6f50a9 | |||
a471629287 | |||
09e42ff5f9 | |||
d2cc3e8b6a | |||
f138b9210e | |||
14b63487bc | |||
cfd92a8d14 | |||
1f62a7ccd6 | |||
d8a767e8da | |||
933be21392 | |||
afe618916b | |||
326eb484cc | |||
db6c64d0ab | |||
7d30af3559 | |||
735833503f | |||
862121cac6 | |||
36f516e3f0 | |||
95876b3400 | |||
7c2b7b0e83 | |||
dd1831b0ea | |||
33f4947456 | |||
cf1029de80 | |||
1422591bf4 | |||
5a2e045c77 | |||
aeb9afdc66 | |||
b705c31630 | |||
6f86dbd552 | |||
5e914b5234 | |||
ac08a639ba | |||
9e6ec9f1ca | |||
ea42471932 | |||
c01fc39671 | |||
7b33ec07fd | |||
2dce3108a1 | |||
ad3aeece07 | |||
62d99a9cad | |||
d096580da7 | |||
62e104653d | |||
caefb591dd | |||
be68d4eb1c | |||
b699639b5c | |||
125b78a348 | |||
2a0dd4f6ac | |||
d1c0a7ece2 | |||
06e7e81de0 | |||
4146f181c7 | |||
a55f35c025 | |||
37a91aa4df | |||
0c6a8e46a2 | |||
0652f53b50 | |||
6fd7aca981 | |||
0973e1e451 | |||
3bc305c037 | |||
228cd5f687 | |||
28f5ea69c3 | |||
e0d321c785 | |||
a75e94e43e | |||
ae5ed75226 | |||
bb85fb8934 | |||
1b40b16970 | |||
efffc3ee3f | |||
dd2804eb97 | |||
714b6c5b72 | |||
f71a3c5326 | |||
289f7e7447 | |||
8e54986357 | |||
866d94f37e | |||
4bbe41a2f8 | |||
eac2381a0a | |||
70a992d299 | |||
e646ae4a84 | |||
722e6cb73a | |||
64bc10f6aa | |||
91ae9924c3 | |||
a7f1db14e5 | |||
a72abd6603 | |||
c20fab2594 | |||
81293db8e9 | |||
358357590f | |||
cd2c511db2 | |||
b1f0632a80 | |||
f2415b6bd0 | |||
5ef58f147f | |||
f573038a21 | |||
69e21e46a2 | |||
0d839329e5 | |||
be613949fe | |||
12b5fd351e | |||
d8a1204d8b | |||
185335e60b | |||
4143b86467 | |||
f034c1a87c | |||
e179c09fae | |||
975b2753c4 | |||
8179b845b7 | |||
33324cde26 | |||
3d5a2fc1a7 | |||
3ac4ac3f3d | |||
fdff302460 | |||
ec23430027 | |||
30fe10e689 | |||
99c1e4c455 | |||
f63fde6ae7 | |||
d8ab14d879 | |||
825ff34cfd | |||
851c10f6e9 | |||
c6b1bb7f3e | |||
0d9b122736 | |||
fc42afbabe | |||
e0a2d2eda0 | |||
d731613f8a | |||
41bc2db10a | |||
b44b458363 | |||
4c85e7c015 | |||
af8e73d28a | |||
c1f8a3bb65 | |||
133beaae3d | |||
10bc42a063 | |||
45ed203859 | |||
60c93dea59 | |||
332e08d038 | |||
e7bf66f28b | |||
c6a346eb90 | |||
37fdd87ab9 | |||
c40ee547b5 | |||
825fc5da53 | |||
2c99c584a5 | |||
56428883a1 | |||
51a0b2150d | |||
1ff5dbbb75 | |||
39e31794db | |||
5f7a69cdd9 | |||
79078d8e0b | |||
f9ac4f925b | |||
37a3d83725 | |||
2ba763e03b | |||
88a74ee26a | |||
9d2b92f370 | |||
da0113a06f | |||
14607eeecf | |||
e93e494173 | |||
50d9163dd8 | |||
50d800baf1 | |||
1a708bd0fc | |||
b06859ffec | |||
c3927bb5a1 | |||
df7f5381a0 | |||
a92ddb21f9 | |||
74148ceec3 | |||
5dee5f89a5 | |||
af1f9fd365 | |||
540e1e8a6e | |||
05da5e7aac | |||
fd4958c06a | |||
0e007abd64 | |||
148e20d1d6 | |||
c5a2eed80f | |||
d6194c57d9 | |||
cd87a4a120 | |||
08cddf4c83 | |||
88fa9e742d | |||
1c17f174a8 | |||
9b36336ac3 | |||
35e551f20c | |||
0f2bda9c34 | |||
8ab694bcc1 | |||
898f59d035 | |||
6b6db37185 | |||
d4a5100128 | |||
22047338e2 | |||
b16cdd3cbb | |||
2a8394a48c | |||
eac4404bbf | |||
fae49d50da | |||
7130525ece | |||
2bf1e03ee1 | |||
15a91e5784 | |||
344201a70d | |||
92e47adb43 | |||
4d385fda60 | |||
82dcd2f63d | |||
08de21a364 | |||
af7d3d9151 | |||
280147282b | |||
b7baf07fb5 | |||
aece2273fb | |||
f807efe4d5 | |||
743d887927 | |||
da5492858c | |||
cebfc713d2 | |||
f522154214 | |||
27cd3ee991 | |||
29873331e6 | |||
8a3dcd68a3 | |||
ac323f2abe | |||
32d26fa956 | |||
137481cf3e | |||
9b63c55d3e |
2
.gitattributes
vendored
Normal file
2
.gitattributes
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
# Declare files that will always have CRLF line endings on checkout.
|
||||
*.ps1 text eol=crlf
|
33
.github/ISSUE_TEMPLATE/bug_report.md
vendored
Normal file
33
.github/ISSUE_TEMPLATE/bug_report.md
vendored
Normal file
|
@ -0,0 +1,33 @@
|
|||
---
|
||||
name: Bug report
|
||||
about: Create a report to help us improve
|
||||
title: "[BUG]"
|
||||
labels: bug
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
- [ ] I am reporting a bug.
|
||||
- [ ] I am running the latest version of BDfR
|
||||
- [ ] I have read the [Opening an issue](https://github.com/aliparlakci/bulk-downloader-for-reddit/blob/master/docs/CONTRIBUTING.md#opening-an-issue)
|
||||
|
||||
## Description
|
||||
|
||||
A clear and concise description of what the bug is.
|
||||
|
||||
## Command
|
||||
|
||||
```text
|
||||
Paste here the command(s) that causes the bug
|
||||
```
|
||||
|
||||
## Environment (please complete the following information)
|
||||
|
||||
- OS: [e.g. Windows 10]
|
||||
- Python version: [e.g. 3.9.4]
|
||||
|
||||
## Logs
|
||||
|
||||
```text
|
||||
Paste the log output here.
|
||||
```
|
16
.github/ISSUE_TEMPLATE/feature_request.md
vendored
Normal file
16
.github/ISSUE_TEMPLATE/feature_request.md
vendored
Normal file
|
@ -0,0 +1,16 @@
|
|||
---
|
||||
name: Feature request
|
||||
about: Suggest an idea for this project
|
||||
title: "[FEATURE]"
|
||||
labels: ''
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
- [ ] I am requesting a feature.
|
||||
- [ ] I am running the latest version of BDfR
|
||||
- [ ] I have read the [Opening an issue](../../README.md#configuration)
|
||||
|
||||
## Description
|
||||
|
||||
Clearly state the current situation and issues you experience. Then, explain how this feature would solve these issues and make life easier. Also, explain the feature with as many detail as possible.
|
20
.github/ISSUE_TEMPLATE/site-support-request.md
vendored
Normal file
20
.github/ISSUE_TEMPLATE/site-support-request.md
vendored
Normal file
|
@ -0,0 +1,20 @@
|
|||
---
|
||||
name: Site Support request
|
||||
about: Describe this issue template's purpose here.
|
||||
title: "[SITE]"
|
||||
labels: ''
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
- [ ] I am requesting a site support.
|
||||
- [ ] I am running the latest version of BDfR
|
||||
- [ ] I have read the [Opening an issue](../../README.md#configuration)
|
||||
|
||||
## Site
|
||||
|
||||
Provide a URL to domain of the site.
|
||||
|
||||
## Example posts
|
||||
|
||||
Provide example reddit posts with the domain.
|
13
.github/workflows/formatting_check.yml
vendored
Normal file
13
.github/workflows/formatting_check.yml
vendored
Normal file
|
@ -0,0 +1,13 @@
|
|||
name: formatting_check
|
||||
run-name: Check code formatting
|
||||
on: [push, pull_request]
|
||||
jobs:
|
||||
formatting_check:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Install dependencies
|
||||
run: sudo gem install mdl
|
||||
- uses: actions/checkout@v3
|
||||
- uses: paolorechia/pox@v1.0.1
|
||||
with:
|
||||
tox_env: "format_check"
|
13
.github/workflows/protect_master.yml
vendored
Normal file
13
.github/workflows/protect_master.yml
vendored
Normal file
|
@ -0,0 +1,13 @@
|
|||
name: Protect master branch
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
- master
|
||||
jobs:
|
||||
merge_check:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check if the pull request is mergeable to master
|
||||
run: |
|
||||
if [[ "$GITHUB_HEAD_REF" == 'development' && "$GITHUB_REPOSITORY" == 'aliparlakci/bulk-downloader-for-reddit' ]]; then exit 0; else exit 1; fi;
|
35
.github/workflows/publish.yml
vendored
Normal file
35
.github/workflows/publish.yml
vendored
Normal file
|
@ -0,0 +1,35 @@
|
|||
# This workflow will upload a Python Package using Twine when a release is created
|
||||
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
|
||||
|
||||
name: Upload Python Package
|
||||
|
||||
on:
|
||||
release:
|
||||
types: [created]
|
||||
|
||||
jobs:
|
||||
deploy:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.9'
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install build setuptools wheel twine
|
||||
- name: Build and publish
|
||||
env:
|
||||
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
|
||||
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
|
||||
run: |
|
||||
python -m build
|
||||
twine upload dist/*
|
||||
|
||||
- name: Upload dist folder
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
name: dist
|
||||
path: dist/
|
59
.github/workflows/test.yml
vendored
Normal file
59
.github/workflows/test.yml
vendored
Normal file
|
@ -0,0 +1,59 @@
|
|||
name: Python Test
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ master, development ]
|
||||
paths-ignore:
|
||||
- "**.md"
|
||||
- ".markdown_style.rb"
|
||||
- ".mdlrc"
|
||||
pull_request:
|
||||
branches: [ master, development ]
|
||||
paths-ignore:
|
||||
- "**.md"
|
||||
- ".markdown_style.rb"
|
||||
- ".mdlrc"
|
||||
|
||||
jobs:
|
||||
test:
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest, macos-latest]
|
||||
python-version: [3.9]
|
||||
ext: [.sh]
|
||||
include:
|
||||
- os: windows-latest
|
||||
python-version: 3.9
|
||||
ext: .ps1
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip Flake8-pyproject pytest pytest-cov
|
||||
pip install .
|
||||
|
||||
- name: Make configuration for tests
|
||||
env:
|
||||
REDDIT_TOKEN: ${{ secrets.REDDIT_TEST_TOKEN }}
|
||||
run: |
|
||||
./devscripts/configure${{ matrix.ext }}
|
||||
|
||||
- name: Lint with flake8
|
||||
run: |
|
||||
flake8 . --select=E9,F63,F7,F82
|
||||
|
||||
- name: Test with pytest
|
||||
run: |
|
||||
pytest -m 'not slow' --verbose --cov=./bdfr/ --cov-report term:skip-covered --cov-report html
|
||||
|
||||
- name: Upload coverage report
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
name: coverage_report
|
||||
path: htmlcov/
|
150
.gitignore
vendored
150
.gitignore
vendored
|
@ -1,6 +1,144 @@
|
|||
build/
|
||||
dist/
|
||||
MANIFEST
|
||||
__pycache__/
|
||||
src/__pycache__/
|
||||
config.json
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# Test configuration file
|
||||
test_config.cfg
|
||||
|
||||
.vscode/
|
||||
.idea/
|
9
.gitmodules
vendored
Normal file
9
.gitmodules
vendored
Normal file
|
@ -0,0 +1,9 @@
|
|||
[submodule "scripts/tests/bats"]
|
||||
path = scripts/tests/bats
|
||||
url = https://github.com/bats-core/bats-core.git
|
||||
[submodule "scripts/tests/test_helper/bats-assert"]
|
||||
path = scripts/tests/test_helper/bats-assert
|
||||
url = https://github.com/bats-core/bats-assert.git
|
||||
[submodule "scripts/tests/test_helper/bats-support"]
|
||||
path = scripts/tests/test_helper/bats-support
|
||||
url = https://github.com/bats-core/bats-support.git
|
4
.markdown_style.rb
Normal file
4
.markdown_style.rb
Normal file
|
@ -0,0 +1,4 @@
|
|||
all
|
||||
exclude_tag :line_length
|
||||
rule 'MD007', :indent => 4
|
||||
rule 'MD029', :style => 'ordered'
|
25
.pre-commit-config.yaml
Normal file
25
.pre-commit-config.yaml
Normal file
|
@ -0,0 +1,25 @@
|
|||
# See https://pre-commit.com for more information
|
||||
# See https://pre-commit.com/hooks.html for more hooks
|
||||
|
||||
repos:
|
||||
- repo: https://github.com/psf/black
|
||||
rev: 22.12.0
|
||||
hooks:
|
||||
- id: black
|
||||
|
||||
- repo: https://github.com/pycqa/isort
|
||||
rev: 5.11.4
|
||||
hooks:
|
||||
- id: isort
|
||||
name: isort (python)
|
||||
|
||||
- repo: https://github.com/pycqa/flake8
|
||||
rev: 6.0.0
|
||||
hooks:
|
||||
- id: flake8
|
||||
additional_dependencies: [Flake8-pyproject]
|
||||
|
||||
- repo: https://github.com/markdownlint/markdownlint
|
||||
rev: v0.12.0
|
||||
hooks:
|
||||
- id: markdownlint
|
495
README.md
495
README.md
|
@ -1,29 +1,466 @@
|
|||
# Bulk Downloader for Reddit
|
||||
Downloads media from reddit posts.
|
||||
|
||||
## [Download the latest release](https://github.com/aliparlakci/bulk-downloader-for-reddit/releases/latest)
|
||||
|
||||
## What it can do
|
||||
- Can get posts from: frontpage, subreddits, multireddits, redditor's submissions, upvoted and saved posts; search results or just plain reddit links
|
||||
- Sorts posts by hot, top, new and so on
|
||||
- Downloads **REDDIT** images and videos, **IMGUR** images and albums, **GFYCAT** links, **EROME** images and albums, **SELF POSTS** and any link to a **DIRECT IMAGE**
|
||||
- Skips the existing ones
|
||||
- Puts post title and OP's name in file's name
|
||||
- Puts every post to its subreddit's folder
|
||||
- Saves a reusable copy of posts' details that are found so that they can be re-downloaded again
|
||||
- Logs failed ones in a file so that you can try to download them later
|
||||
|
||||
## **[Compiling it from source code](docs/COMPILE_FROM_SOURCE.md)**
|
||||
*\* MacOS users have to use this option.*
|
||||
|
||||
## Additional options
|
||||
Script also accepts additional options via command-line arguments. Get further information from **[`--help`](docs/COMMAND_LINE_ARGUMENTS.md)**
|
||||
|
||||
## Setting up the script
|
||||
You need to create an imgur developer app in order for the API to work. Go to https://api.imgur.com/oauth2/addclient and fill the form (It does not really matter how you fill it).
|
||||
|
||||
It should redirect you to a page where it shows your **imgur_client_id** and **imgur_client_secret**.
|
||||
|
||||
## [FAQ](docs/FAQ.md)
|
||||
|
||||
## [Changes on *master*](docs/CHANGELOG.md)
|
||||
# Bulk Downloader for Reddit
|
||||
|
||||
[![PyPI Status](https://img.shields.io/pypi/status/bdfr?logo=PyPI)](https://pypi.python.org/pypi/bdfr)
|
||||
[![PyPI version](https://img.shields.io/pypi/v/bdfr.svg?logo=PyPI)](https://pypi.python.org/pypi/bdfr)
|
||||
[![PyPI downloads](https://img.shields.io/pypi/dm/bdfr?logo=PyPI)](https://pypi.python.org/pypi/bdfr)
|
||||
[![AUR version](https://img.shields.io/aur/version/python-bdfr?logo=Arch%20Linux)](https://aur.archlinux.org/packages/python-bdfr)
|
||||
[![Python Test](https://github.com/aliparlakci/bulk-downloader-for-reddit/actions/workflows/test.yml/badge.svg?branch=master)](https://github.com/aliparlakci/bulk-downloader-for-reddit/actions/workflows/test.yml)
|
||||
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg?logo=Python)](https://github.com/psf/black)
|
||||
[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit)](https://github.com/pre-commit/pre-commit)
|
||||
|
||||
This is a tool to download submissions or submission data from Reddit. It can be used to archive data or even crawl Reddit to gather research data. The BDFR is flexible and can be used in scripts if needed through an extensive command-line interface. [List of currently supported sources](#list-of-currently-supported-sources)
|
||||
|
||||
If you wish to open an issue, please read [the guide on opening issues](docs/CONTRIBUTING.md#opening-an-issue) to ensure that your issue is clear and contains everything it needs to for the developers to investigate.
|
||||
|
||||
Included in this README are a few example Bash tricks to get certain behaviour. For that, see [Common Command Tricks](#common-command-tricks).
|
||||
|
||||
## Installation
|
||||
|
||||
*Bulk Downloader for Reddit* needs Python version 3.9 or above. Please update Python before installation to meet the requirement.
|
||||
|
||||
Then, you can install it via pip with:
|
||||
|
||||
```bash
|
||||
python3 -m pip install bdfr --upgrade
|
||||
```
|
||||
|
||||
or via [pipx](https://pypa.github.io/pipx) with:
|
||||
|
||||
```bash
|
||||
python3 -m pipx install bdfr
|
||||
```
|
||||
|
||||
**To update BDFR**, run the above command again for pip or `pipx upgrade bdfr` for pipx installations.
|
||||
|
||||
**To check your version of BDFR**, run `bdfr --version`
|
||||
|
||||
**To install shell completions**, run `bdfr completions`
|
||||
|
||||
### AUR Package
|
||||
|
||||
If on Arch Linux or derivative operating systems such as Manjaro, the BDFR can be installed through the AUR.
|
||||
|
||||
- Latest Release: <https://aur.archlinux.org/packages/python-bdfr>
|
||||
- Latest Development Build: <https://aur.archlinux.org/packages/python-bdfr-git>
|
||||
|
||||
### Source code
|
||||
|
||||
If you want to use the source code or make contributions, refer to [CONTRIBUTING](docs/CONTRIBUTING.md#preparing-the-environment-for-development)
|
||||
|
||||
## Usage
|
||||
|
||||
The BDFR works by taking submissions from a variety of "sources" from Reddit and then parsing them to download. These sources might be a subreddit, multireddit, a user list, or individual links. These sources are combined and downloaded to disk, according to a naming and organisational scheme defined by the user.
|
||||
|
||||
There are three modes to the BDFR: download, archive, and clone. Each one has a command that performs similar but distinct functions. The `download` command will download the resource linked in the Reddit submission, such as the images, video, etc. The `archive` command will download the submission data itself and store it, such as the submission details, upvotes, text, statistics, as and all the comments on that submission. These can then be saved in a data markup language form, such as JSON, XML, or YAML. Lastly, the `clone` command will perform both functions of the previous commands at once and is more efficient than running those commands sequentially.
|
||||
|
||||
Note that the `clone` command is not a true, faithful clone of Reddit. It simply retrieves much of the raw data that Reddit provides. To get a true clone of Reddit, another tool such as HTTrack should be used.
|
||||
|
||||
After installation, run the program from any directory as shown below:
|
||||
|
||||
```bash
|
||||
bdfr download
|
||||
```
|
||||
|
||||
```bash
|
||||
bdfr archive
|
||||
```
|
||||
|
||||
```bash
|
||||
bdfr clone
|
||||
```
|
||||
|
||||
However, these commands are not enough. You should chain parameters in [Options](#options) according to your use case. Don't forget that some parameters can be provided multiple times. Some quick reference commands are:
|
||||
|
||||
```bash
|
||||
bdfr download ./path/to/output --subreddit Python -L 10
|
||||
```
|
||||
|
||||
```bash
|
||||
bdfr download ./path/to/output --user reddituser --submitted -L 100
|
||||
```
|
||||
|
||||
```bash
|
||||
bdfr download ./path/to/output --user me --saved --authenticate -L 25 --file-scheme '{POSTID}'
|
||||
```
|
||||
|
||||
```bash
|
||||
bdfr download ./path/to/output --subreddit 'Python, all, mindustry' -L 10 --make-hard-links
|
||||
```
|
||||
|
||||
```bash
|
||||
bdfr archive ./path/to/output --user reddituser --submitted --all-comments --comment-context
|
||||
```
|
||||
|
||||
```bash
|
||||
bdfr archive ./path/to/output --subreddit all --format yaml -L 500 --folder-scheme ''
|
||||
```
|
||||
|
||||
Alternatively, you can pass options through a YAML file.
|
||||
|
||||
```bash
|
||||
bdfr download ./path/to/output --opts my_opts.yaml
|
||||
```
|
||||
|
||||
For example, running it with the following file
|
||||
|
||||
```yaml
|
||||
skip: [mp4, avi]
|
||||
file_scheme: "{UPVOTES}_{REDDITOR}_{POSTID}_{DATE}"
|
||||
limit: 10
|
||||
sort: top
|
||||
subreddit:
|
||||
- EarthPorn
|
||||
- CityPorn
|
||||
```
|
||||
|
||||
would be equivalent to (take note that in YAML there is `file_scheme` instead of `file-scheme`):
|
||||
|
||||
```bash
|
||||
bdfr download ./path/to/output --skip mp4 --skip avi --file-scheme "{UPVOTES}_{REDDITOR}_{POSTID}_{DATE}" -L 10 -S top --subreddit EarthPorn --subreddit CityPorn
|
||||
```
|
||||
|
||||
Any option that can be specified multiple times should be formatted like subreddit is above.
|
||||
|
||||
In case the same option is specified both in the YAML file and as a command-line argument, the command-line argument takes priority.
|
||||
|
||||
## Options
|
||||
|
||||
The following options are common between both the `archive` and `download` commands of the BDFR.
|
||||
|
||||
- `directory`
|
||||
- This is the directory to which the BDFR will download and place all files
|
||||
- `--authenticate`
|
||||
- This flag will make the BDFR attempt to use an authenticated Reddit session
|
||||
- See [Authentication](#authentication-and-security) for more details
|
||||
- `--config`
|
||||
- If the path to a configuration file is supplied with this option, the BDFR will use the specified config
|
||||
- See [Configuration Files](#configuration) for more details
|
||||
- `--opts`
|
||||
- Load options from a YAML file.
|
||||
- Has higher priority than the global config file but lower than command-line arguments.
|
||||
- See [opts_example.yaml](./opts_example.yaml) for an example file.
|
||||
- `--disable-module`
|
||||
- Can be specified multiple times
|
||||
- Disables certain modules from being used
|
||||
- See [Disabling Modules](#disabling-modules) for more information and a list of module names
|
||||
- `--filename-restriction-scheme`
|
||||
- Can be: `windows`, `linux`
|
||||
- Turns off the OS detection and specifies which system to use when making filenames
|
||||
- See [Filesystem Restrictions](#filesystem-restrictions)
|
||||
- `--ignore-user`
|
||||
- This will add a user to ignore
|
||||
- Can be specified multiple times
|
||||
- `--include-id-file`
|
||||
- This will add any submission with the IDs in the files provided
|
||||
- Can be specified multiple times
|
||||
- Format is one ID per line
|
||||
- `--log`
|
||||
- This allows one to specify the location of the logfile
|
||||
- This must be done when running multiple instances of the BDFR, see [Multiple Instances](#multiple-instances) below
|
||||
- `--saved`
|
||||
- This option will make the BDFR use the supplied user's saved posts list as a download source
|
||||
- This requires an authenticated Reddit instance, using the `--authenticate` flag, as well as `--user` set to `me`
|
||||
- `--search`
|
||||
- This will apply the input search term to specific lists when scraping submissions
|
||||
- A search term can only be applied when using the `--subreddit` and `--multireddit` flags
|
||||
- `--submitted`
|
||||
- This will use a user's submissions as a source
|
||||
- A user must be specified with `--user`
|
||||
- `--upvoted`
|
||||
- This will use a user's upvoted posts as a source of posts to scrape
|
||||
- This requires an authenticated Reddit instance, using the `--authenticate` flag, as well as `--user` set to `me`
|
||||
- `-L, --limit`
|
||||
- This is the limit on the number of submissions retrieved
|
||||
- Default is max possible
|
||||
- Note that this limit applies to **each source individually** e.g. if a `--limit` of 10 and three subreddits are provided, then 30 total submissions will be scraped
|
||||
- If it is not supplied, then the BDFR will default to the maximum allowed by Reddit, roughly 1000 posts. **We cannot bypass this.**
|
||||
- `-S, --sort`
|
||||
- This is the sort type for each applicable submission source supplied to the BDFR
|
||||
- This option does not apply to upvoted or saved posts when scraping from these sources
|
||||
- The following options are available:
|
||||
- `controversial`
|
||||
- `hot` (default)
|
||||
- `new`
|
||||
- `relevance` (only available when using `--search`)
|
||||
- `rising`
|
||||
- `top`
|
||||
- `-l, --link`
|
||||
- This is a direct link to a submission to download, either as a URL or an ID
|
||||
- Can be specified multiple times
|
||||
- `-m, --multireddit`
|
||||
- This is the name of a multireddit to add as a source
|
||||
- Can be specified multiple times
|
||||
- This can be done by using `-m` multiple times
|
||||
- Multireddits can also be used to provide CSV multireddits e.g. `-m 'chess, favourites'`
|
||||
- The specified multireddits must all belong to the user specified with the `--user` option
|
||||
- `-s, --subreddit`
|
||||
- This adds a subreddit as a source
|
||||
- Can be used multiple times
|
||||
- This can be done by using `-s` multiple times
|
||||
- Subreddits can also be used to provide CSV subreddits e.g. `-s 'all, python, mindustry'`
|
||||
- `-t, --time`
|
||||
- This is the time filter that will be applied to all applicable sources
|
||||
- This option does not apply to upvoted or saved posts when scraping from these sources
|
||||
- This option only applies if sorting by top or controversial. See --sort for more detail.
|
||||
- The following options are available:
|
||||
- `all` (default)
|
||||
- `hour`
|
||||
- `day`
|
||||
- `week`
|
||||
- `month`
|
||||
- `year`
|
||||
- `--time-format`
|
||||
- This specifies the format of the datetime string that replaces `{DATE}` in file and folder naming schemes
|
||||
- See [Time Formatting Customisation](#time-formatting-customisation) for more details, and the formatting scheme
|
||||
- `-u, --user`
|
||||
- This specifies the user to scrape in concert with other options
|
||||
- When using `--authenticate`, `--user me` can be used to refer to the authenticated user
|
||||
- Can be specified multiple times for multiple users
|
||||
- If downloading a multireddit, only one user can be specified
|
||||
- `-v, --verbose`
|
||||
- Increases the verbosity of the program
|
||||
- Can be specified multiple times
|
||||
|
||||
### Downloader Options
|
||||
|
||||
The following options apply only to the `download` command. This command downloads the files and resources linked to in the submission, or a text submission itself, to the disk in the specified directory.
|
||||
|
||||
- `--make-hard-links`
|
||||
- This flag will create hard links to an existing file when a duplicate is downloaded in the current run
|
||||
- This will make the file appear in multiple directories while only taking the space of a single instance
|
||||
- `--max-wait-time`
|
||||
- This option specifies the maximum wait time for downloading a resource
|
||||
- The default is 120 seconds
|
||||
- See [Rate Limiting](#rate-limiting) for details
|
||||
- `--no-dupes`
|
||||
- This flag will not redownload files if they were already downloaded in the current run
|
||||
- This is calculated by MD5 hash
|
||||
- `--search-existing`
|
||||
- This will make the BDFR compile the hashes for every file in `directory`
|
||||
- The hashes are used to remove duplicates if `--no-dupes` is supplied or make hard links if `--make-hard-links` is supplied
|
||||
- `--file-scheme`
|
||||
- Sets the scheme for files
|
||||
- Default is `{REDDITOR}_{TITLE}_{POSTID}`
|
||||
- See [Folder and File Name Schemes](#folder-and-file-name-schemes) for more details
|
||||
- `--folder-scheme`
|
||||
- Sets the scheme for folders
|
||||
- Default is `{SUBREDDIT}`
|
||||
- See [Folder and File Name Schemes](#folder-and-file-name-schemes) for more details
|
||||
- `--exclude-id`
|
||||
- This will skip the download of any submission with the ID provided
|
||||
- Can be specified multiple times
|
||||
- `--exclude-id-file`
|
||||
- This will skip the download of any submission with any of the IDs in the files provided
|
||||
- Can be specified multiple times
|
||||
- Format is one ID per line
|
||||
- `--skip-domain`
|
||||
- This adds domains to the download filter i.e. submissions coming from these domains will not be downloaded
|
||||
- Can be specified multiple times
|
||||
- Domains must be supplied in the form `example.com` or `img.example.com`
|
||||
- `--skip`
|
||||
- This adds file types to the download filter i.e. submissions with one of the supplied file extensions will not be downloaded
|
||||
- Can be specified multiple times
|
||||
- `--skip-subreddit`
|
||||
- This skips all submissions from the specified subreddit
|
||||
- Can be specified multiple times
|
||||
- Also accepts CSV subreddit names
|
||||
- `--min-score`
|
||||
- This skips all submissions which have fewer than specified upvotes
|
||||
- `--max-score`
|
||||
- This skips all submissions which have more than specified upvotes
|
||||
- `--min-score-ratio`
|
||||
- This skips all submissions which have lower than specified upvote ratio
|
||||
- `--max-score-ratio`
|
||||
- This skips all submissions which have higher than specified upvote ratio
|
||||
|
||||
### Archiver Options
|
||||
|
||||
The following options are for the `archive` command specifically.
|
||||
|
||||
- `--all-comments`
|
||||
- When combined with the `--user` option, this will download all the user's comments
|
||||
- `-f, --format`
|
||||
- This specifies the format of the data file saved to disk
|
||||
- The following formats are available:
|
||||
- `json` (default)
|
||||
- `xml`
|
||||
- `yaml`
|
||||
- `--comment-context`
|
||||
- This option will, instead of downloading an individual comment, download the submission that comment is a part of
|
||||
- May result in a longer run time as it retrieves much more data
|
||||
|
||||
### Cloner Options
|
||||
|
||||
The `clone` command can take all the options listed above for both the `archive` and `download` commands since it performs the functions of both.
|
||||
|
||||
## Common Command Tricks
|
||||
|
||||
A common use case is for subreddits/users to be loaded from a file. The BDFR supports this via YAML file options (`--opts my_opts.yaml`).
|
||||
|
||||
Alternatively, you can use the command-line [xargs](https://en.wikipedia.org/wiki/Xargs) function.
|
||||
For a list of users `users.txt` (one user per line), type:
|
||||
|
||||
```bash
|
||||
cat users.txt | xargs -L 1 echo --user | xargs -L 50 bdfr download <ARGS>
|
||||
```
|
||||
|
||||
The part `-L 50` is to make sure that the character limit for a single line isn't exceeded, but may not be necessary. This can also be used to load subreddits from a file, simply exchange `--user` with `--subreddit` and so on.
|
||||
|
||||
## Authentication and Security
|
||||
|
||||
The BDFR uses OAuth2 authentication to connect to Reddit if authentication is required. This means that it is a secure, token-based system for making requests. This also means that the BDFR only has access to specific parts of the account authenticated, by default only saved posts, upvoted posts, and the identity of the authenticated account. Note that authentication is not required unless accessing private things like upvoted posts, saved posts, and private multireddits.
|
||||
|
||||
To authenticate, the BDFR will first look for a token in the configuration file that signals that there's been a previous authentication. If this is not there, then the BDFR will attempt to register itself with your account. This is normal, and if you run the program, it will pause and show a Reddit URL. Click on this URL and it will take you to Reddit, where the permissions being requested will be shown. Read this and **confirm that there are no more permissions than needed to run the program**. You should not grant unneeded permissions; by default, the BDFR only requests permission to read your saved or upvoted submissions and identify as you.
|
||||
|
||||
If the permissions look safe, confirm it, and the BDFR will save a token that will allow it to authenticate with Reddit from then on.
|
||||
|
||||
## Changing Permissions
|
||||
|
||||
Most users will not need to do anything extra to use any of the current features. However, if additional features such as scraping messages, PMs, etc are added in the future, these will require additional scopes. Additionally, advanced users may wish to use the BDFR with their own API key and secret. There is normally no need to do this, but it *is* allowed by the BDFR.
|
||||
|
||||
The configuration file for the BDFR contains the API secret and key, as well as the scopes that the BDFR will request when registering itself to a Reddit account via OAuth2. These can all be changed if the user wishes, however do not do so if you don't know what you are doing. The defaults are specifically chosen to have a very low security risk if your token were to be compromised, however unlikely that actually is. Never grant more permissions than you absolutely need.
|
||||
|
||||
For more details on the configuration file and the values therein, see [Configuration Files](#configuration).
|
||||
|
||||
## Folder and File Name Schemes
|
||||
|
||||
The naming and folder schemes for the BDFR are both completely customisable. A number of different fields can be given which will be replaced with properties from a submission when downloading it. The scheme format takes the form of `{KEY}`, where `KEY` is a string from the below list.
|
||||
|
||||
- `DATE`
|
||||
- `FLAIR`
|
||||
- `POSTID`
|
||||
- `REDDITOR`
|
||||
- `SUBREDDIT`
|
||||
- `TITLE`
|
||||
- `UPVOTES`
|
||||
|
||||
Each of these can be enclosed in curly bracket, `{}`, and included in the name. For example, to just title every downloaded post with the unique submission ID, you can use `{POSTID}`. Static strings can also be included, such as `download_{POSTID}` which will not change from submission to submission. For example, the previous string will result in the following submission file names:
|
||||
|
||||
- `download_aaaaaa.png`
|
||||
- `download_bbbbbb.png`
|
||||
|
||||
At least one key *must* be included in the file scheme, otherwise an error will be thrown. The folder scheme however, can be null or a simple static string. In the former case, all files will be placed in the folder specified with the `directory` argument. If the folder scheme is a static string, then all submissions will be placed in a folder of that name. In both cases, there will be no separation between all submissions.
|
||||
|
||||
It is highly recommended that the file name scheme contain the parameter `{POSTID}` as this is **the only parameter guaranteed to be unique**. No combination of other keys will necessarily be unique and may result in posts being skipped as the BDFR will see files by the same name and skip the download, assuming that they are already downloaded.
|
||||
|
||||
## Configuration
|
||||
|
||||
The configuration files are, by default, stored in the configuration directory for the user. This differs depending on the OS that the BDFR is being run on. For Windows, this will be:
|
||||
|
||||
- `C:\Users\<User>\AppData\Local\BDFR\bdfr`
|
||||
|
||||
If Python has been installed through the Windows Store, the folder will appear in a different place. Note that the hash included in the file path may change from installation to installation.
|
||||
|
||||
- `C:\Users\<User>\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\Local\BDFR\bdfr`
|
||||
|
||||
On Mac OSX, this will be:
|
||||
|
||||
- `~/Library/Application Support/bdfr`.
|
||||
|
||||
Lastly, on a Linux system, this will be:
|
||||
|
||||
- `~/.config/bdfr/`
|
||||
|
||||
The logging output for each run of the BDFR will be saved to this directory in the file `log_output.txt`. If you need to submit a bug, it is this file that you will need to submit with the report.
|
||||
|
||||
### Configuration File
|
||||
|
||||
The `config.cfg` is the file that supplies the BDFR with the configuration to use. At the moment, the following keys **must** be included in the configuration file supplied.
|
||||
|
||||
- `client_id`
|
||||
- `client_secret`
|
||||
- `scopes`
|
||||
|
||||
The following keys are optional, and defaults will be used if they cannot be found.
|
||||
|
||||
- `backup_log_count`
|
||||
- `max_wait_time`
|
||||
- `time_format`
|
||||
- `disabled_modules`
|
||||
- `filename-restriction-scheme`
|
||||
|
||||
All of these should not be modified unless you know what you're doing, as the default values will enable the BDFR to function just fine. A configuration is included in the BDFR when it is installed, and this will be placed in the configuration directory as the default.
|
||||
|
||||
Most of these values have to do with OAuth2 configuration and authorisation. The key `backup_log_count` however has to do with the log rollover. The logs in the configuration directory can be verbose and for long runs of the BDFR, can grow quite large. To combat this, the BDFR will overwrite previous logs. This value determines how many previous run logs will be kept. The default is 3, which means that the BDFR will keep at most three past logs plus the current one. Any runs past this will overwrite the oldest log file, called "rolling over". If you want more records of past runs, increase this number.
|
||||
|
||||
#### Time Formatting Customisation
|
||||
|
||||
The option `time_format` will specify the format of the timestamp that replaces `{DATE}` in filename and folder name schemes. By default, this is the [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) format which is highly recommended due to its standardised nature. If you don't **need** to change it, it is recommended that you do not. However, you can specify it to anything required with this option. The `--time-format` option supersedes any specification in the configuration file
|
||||
|
||||
The format can be specified through the [format codes](https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior) that are standard in the Python `datetime` library.
|
||||
|
||||
#### Disabling Modules
|
||||
|
||||
The individual modules of the BDFR, used to download submissions from websites, can be disabled. This is helpful especially in the case of the fallback downloaders, since the `--skip-domain` option cannot be effectively used in these cases. For example, the Youtube-DL downloader can retrieve data from hundreds of websites and domains; thus the only way to fully disable it is via the `--disable-module` option.
|
||||
|
||||
Modules can be disabled through the command line interface for the BDFR or more permanently in the configuration file via the `disabled_modules` option. The list of downloaders that can be disabled are the following. Note that they are case-insensitive.
|
||||
|
||||
- `Direct`
|
||||
- `DelayForReddit`
|
||||
- `Erome`
|
||||
- `Gallery` (Reddit Image Galleries)
|
||||
- `Gfycat`
|
||||
- `Imgur`
|
||||
- `PornHub`
|
||||
- `Redgifs`
|
||||
- `SelfPost` (Reddit Text Post)
|
||||
- `Vidble`
|
||||
- `VReddit` (Reddit Video Post)
|
||||
- `Youtube`
|
||||
- `YoutubeDlFallback`
|
||||
|
||||
### Rate Limiting
|
||||
|
||||
The option `max_wait_time` has to do with retrying downloads. There are certain HTTP errors that mean that no amount of requests will return the wanted data, but some errors are from rate-limiting. This is when a single client is making so many requests that the remote website cuts the client off to preserve the function of the site. This is a common situation when downloading many resources from the same site. It is polite and best practice to obey the website's wishes in these cases.
|
||||
|
||||
To this end, the BDFR will sleep for a time before retrying the download, giving the remote server time to "rest". This is done in 60 second increments. For example, if a rate-limiting-related error is given, the BDFR will sleep for 60 seconds before retrying. Then, if the same type of error occurs, it will sleep for another 120 seconds, then 180 seconds, and so on.
|
||||
|
||||
The option `--max-wait-time` and the configuration option `max_wait_time` both specify the maximum time the BDFR will wait. If both are present, the command-line option takes precedence. For instance, the default is 120, so the BDFR will wait for 60 seconds, then 120 seconds, and then move on. **Note that this results in a total time of 180 seconds trying the same download**. If you wish to try to bypass the rate-limiting system on the remote site, increasing the maximum wait time may help. However, note that the actual wait times increase exponentially if the resource is not downloaded i.e. specifying a max value of 300 (5 minutes), can make the BDFR pause for 15 minutes on one submission, not 5, in the worst case.
|
||||
|
||||
## Multiple Instances
|
||||
|
||||
The BDFR can be run in multiple instances with multiple configurations, either concurrently or consecutively. The use of scripting files facilitates this the easiest, either Powershell on Windows operating systems or Bash elsewhere. This allows multiple scenarios to be run with data being scraped from different sources, as any two sets of scenarios might be mutually exclusive i.e. it is not possible to download any combination of data from a single run of the BDFR. To download from multiple users for example, multiple runs of the BDFR are required.
|
||||
|
||||
Running these scenarios consecutively is done easily, like any single run. Configuration files that differ may be specified with the `--config` option to switch between tokens, for example. Otherwise, almost all configuration for data sources can be specified per-run through the command line.
|
||||
|
||||
Running scenarios concurrently (at the same time) however, is more complicated. The BDFR will look to a single, static place to put the detailed log files, in a directory with the configuration file specified above. If there are multiple instances, or processes, of the BDFR running at the same time, they will all be trying to write to a single file. On Linux and other UNIX based operating systems, this will succeed, though there is a substantial risk that the logfile will be useless due to garbled and jumbled data. On Windows however, attempting this will raise an error that crashes the program as Windows forbids multiple processes from accessing the same file.
|
||||
|
||||
The way to fix this is to use the `--log` option to manually specify where the logfile is to be stored. If the given location is unique to each instance of the BDFR, then it will run fine.
|
||||
|
||||
## Filesystem Restrictions
|
||||
|
||||
Different filesystems have different restrictions for what files and directories can be named. These are separated into two broad categories: Linux-based filesystems, which have very few restrictions; and Windows-based filesystems, which are much more restrictive in terms of forbidden characters and length of paths.
|
||||
|
||||
During the normal course of operation, the BDFR detects what filesystem it is running on and formats any filenames and directories to conform to the rules that are expected of it. However, there are cases where this will fail. When running on a Linux-based machine, or another system where the home filesystem is permissive, and accessing a share or drive with a less permissive system, the BDFR will assume that the *home* filesystem's rules apply. For example, when downloading to a SAMBA share from Ubuntu, there will be errors as SAMBA is more restrictive than Ubuntu.
|
||||
|
||||
The best option would be to always download to a filesystem that is as permissive as possible, such as an NFS share or ext4 drive. However, when this is not possible, the BDFR allows for the restriction scheme to be manually specified at either the command-line or in the configuration file. At the command-line, this is done with `--filename-restriction-scheme windows`, or else an option by the same name in the configuration file.
|
||||
|
||||
## Manipulating Logfiles
|
||||
|
||||
The logfiles that the BDFR outputs are consistent and quite detailed and in a format that is amenable to regex. To this end, a number of bash scripts have been [included here](./scripts). They show examples for how to extract successfully downloaded IDs, failed IDs, and more besides.
|
||||
|
||||
## Unsaving posts
|
||||
|
||||
Back in v1 there was an option to unsave posts from your account when downloading, but it was removed from the core BDFR on v2 as it is considered a read-only tool. However, for those missing this functionality, a script was created that uses the log files to achieve this. There is info on how to use this on the README.md file on the scripts subdirectory.
|
||||
|
||||
## List of currently supported sources
|
||||
|
||||
- Direct links (links leading to a file)
|
||||
- Delay for Reddit
|
||||
- Erome
|
||||
- Gfycat
|
||||
- Gif Delivery Network
|
||||
- Imgur
|
||||
- Reddit Galleries
|
||||
- Reddit Text Posts
|
||||
- Reddit Videos
|
||||
- Redgifs
|
||||
- Vidble
|
||||
- YouTube
|
||||
- Any source supported by [YT-DLP](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md) should be compatible
|
||||
|
||||
## Contributing
|
||||
|
||||
If you wish to contribute, see [Contributing](docs/CONTRIBUTING.md) for more information.
|
||||
|
||||
When reporting any issues or interacting with the developers, please follow the [Code of Conduct](docs/CODE_OF_CONDUCT.md).
|
||||
|
|
|
@ -1 +1 @@
|
|||
theme: jekyll-theme-cayman
|
||||
theme: jekyll-theme-leap-day
|
4
bdfr/__init__.py
Normal file
4
bdfr/__init__.py
Normal file
|
@ -0,0 +1,4 @@
|
|||
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Single source of truth for the package version; read by the
# `--version` callback in bdfr/__main__.py.
__version__ = "2.6.2"
|
222
bdfr/__main__.py
Normal file
222
bdfr/__main__.py
Normal file
|
@ -0,0 +1,222 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import logging
|
||||
import sys
|
||||
|
||||
import click
|
||||
import requests
|
||||
|
||||
from bdfr import __version__
|
||||
from bdfr.archiver import Archiver
|
||||
from bdfr.cloner import RedditCloner
|
||||
from bdfr.completion import Completion
|
||||
from bdfr.configuration import Configuration
|
||||
from bdfr.downloader import RedditDownloader
|
||||
|
||||
logger = logging.getLogger()
|
||||
|
||||
# Options accepted by every subcommand (download, archive, clone).
# All defaults are None/empty so Configuration.process_click_arguments can
# tell "not supplied on the CLI" apart from a real value.
_common_options = [
    click.argument("directory", type=str),
    click.option("--authenticate", is_flag=True, default=None),
    click.option("--config", type=str, default=None),
    click.option("--disable-module", multiple=True, default=None, type=str),
    click.option("--exclude-id", default=None, multiple=True),
    click.option("--exclude-id-file", default=None, multiple=True),
    click.option("--file-scheme", default=None, type=str),
    click.option("--filename-restriction-scheme", type=click.Choice(("linux", "windows")), default=None),
    click.option("--folder-scheme", default=None, type=str),
    click.option("--ignore-user", type=str, multiple=True, default=None),
    click.option("--include-id-file", multiple=True, default=None),
    click.option("--log", type=str, default=None),
    click.option("--opts", type=str, default=None),
    click.option("--saved", is_flag=True, default=None),
    click.option("--search", default=None, type=str),
    click.option("--submitted", is_flag=True, default=None),
    click.option("--subscribed", is_flag=True, default=None),
    click.option("--time-format", type=str, default=None),
    click.option("--upvoted", is_flag=True, default=None),
    click.option("-L", "--limit", default=None, type=int),
    click.option("-l", "--link", multiple=True, default=None, type=str),
    click.option("-m", "--multireddit", multiple=True, default=None, type=str),
    click.option(
        "-S", "--sort", type=click.Choice(("hot", "top", "new", "controversial", "rising", "relevance")), default=None
    ),
    click.option("-s", "--subreddit", multiple=True, default=None, type=str),
    click.option("-t", "--time", type=click.Choice(("all", "hour", "day", "week", "month", "year")), default=None),
    click.option("-u", "--user", type=str, multiple=True, default=None),
    click.option("-v", "--verbose", default=None, count=True),
]

# Options that only apply when files are actually downloaded
# (download and clone subcommands).
_downloader_options = [
    click.option("--make-hard-links", is_flag=True, default=None),
    click.option("--max-wait-time", type=int, default=None),
    click.option("--no-dupes", is_flag=True, default=None),
    click.option("--search-existing", is_flag=True, default=None),
    click.option("--skip", default=None, multiple=True),
    click.option("--skip-domain", default=None, multiple=True),
    click.option("--skip-subreddit", default=None, multiple=True),
    click.option("--min-score", type=int, default=None),
    click.option("--max-score", type=int, default=None),
    click.option("--min-score-ratio", type=float, default=None),
    click.option("--max-score-ratio", type=float, default=None),
]

# Options specific to the archiver (archive and clone subcommands).
_archiver_options = [
    click.option("--all-comments", is_flag=True, default=None),
    click.option("--comment-context", is_flag=True, default=None),
    click.option("-f", "--format", type=click.Choice(("xml", "json", "yaml")), default=None),
]
|
||||
|
||||
|
||||
def _add_options(opts: list):
|
||||
def wrap(func):
|
||||
for opt in opts:
|
||||
func = opt(func)
|
||||
return func
|
||||
|
||||
return wrap
|
||||
|
||||
|
||||
def _check_version(context, param, value):
    """Eager click callback for ``--version``: print current vs latest and exit.

    ``param`` is unused but required by the click callback signature.
    """
    if not value or context.resilient_parsing:
        return
    current = __version__
    # A timeout prevents the CLI from hanging indefinitely if PyPI is
    # unreachable (requests has no default timeout).
    latest = requests.get("https://pypi.org/pypi/bdfr/json", timeout=10).json()["info"]["version"]
    print(f"You are currently using v{current} the latest is v{latest}")
    context.exit()
|
||||
|
||||
|
||||
# Root command group. --version is eager and exposes no value, so the
# _check_version callback runs (and exits) before any subcommand parsing.
@click.group()
@click.help_option("-h", "--help")
@click.option(
    "--version",
    is_flag=True,
    is_eager=True,
    expose_value=False,
    callback=_check_version,
    help="Check version and exit.",
)
def cli():
    """BDFR is used to download and archive content from Reddit."""
    pass
|
||||
|
||||
|
||||
@cli.command("download")
|
||||
@_add_options(_common_options)
|
||||
@_add_options(_downloader_options)
|
||||
@click.help_option("-h", "--help")
|
||||
@click.pass_context
|
||||
def cli_download(context: click.Context, **_):
|
||||
"""Used to download content posted to Reddit."""
|
||||
config = Configuration()
|
||||
config.process_click_arguments(context)
|
||||
silence_module_loggers()
|
||||
stream = make_console_logging_handler(config.verbose)
|
||||
try:
|
||||
reddit_downloader = RedditDownloader(config, [stream])
|
||||
reddit_downloader.download()
|
||||
except Exception:
|
||||
logger.exception("Downloader exited unexpectedly")
|
||||
raise
|
||||
else:
|
||||
logger.info("Program complete")
|
||||
|
||||
|
||||
@cli.command("archive")
|
||||
@_add_options(_common_options)
|
||||
@_add_options(_archiver_options)
|
||||
@click.help_option("-h", "--help")
|
||||
@click.pass_context
|
||||
def cli_archive(context: click.Context, **_):
|
||||
"""Used to archive post data from Reddit."""
|
||||
config = Configuration()
|
||||
config.process_click_arguments(context)
|
||||
silence_module_loggers()
|
||||
stream = make_console_logging_handler(config.verbose)
|
||||
try:
|
||||
reddit_archiver = Archiver(config, [stream])
|
||||
reddit_archiver.download()
|
||||
except Exception:
|
||||
logger.exception("Archiver exited unexpectedly")
|
||||
raise
|
||||
else:
|
||||
logger.info("Program complete")
|
||||
|
||||
|
||||
@cli.command("clone")
|
||||
@_add_options(_common_options)
|
||||
@_add_options(_archiver_options)
|
||||
@_add_options(_downloader_options)
|
||||
@click.help_option("-h", "--help")
|
||||
@click.pass_context
|
||||
def cli_clone(context: click.Context, **_):
|
||||
"""Combines archive and download commands."""
|
||||
config = Configuration()
|
||||
config.process_click_arguments(context)
|
||||
silence_module_loggers()
|
||||
stream = make_console_logging_handler(config.verbose)
|
||||
try:
|
||||
reddit_scraper = RedditCloner(config, [stream])
|
||||
reddit_scraper.download()
|
||||
except Exception:
|
||||
logger.exception("Scraper exited unexpectedly")
|
||||
raise
|
||||
else:
|
||||
logger.info("Program complete")
|
||||
|
||||
|
||||
@cli.command("completion")
|
||||
@click.argument("shell", type=click.Choice(("all", "bash", "fish", "zsh"), case_sensitive=False), default="all")
|
||||
@click.help_option("-h", "--help")
|
||||
@click.option("-u", "--uninstall", is_flag=True, default=False, help="Uninstall completion")
|
||||
def cli_completion(shell: str, uninstall: bool):
|
||||
"""\b
|
||||
Installs shell completions for BDFR.
|
||||
Options: all, bash, fish, zsh
|
||||
Default: all"""
|
||||
shell = shell.lower()
|
||||
if sys.platform == "win32":
|
||||
print("Completions are not currently supported on Windows.")
|
||||
return
|
||||
if uninstall and click.confirm(f"Would you like to uninstall {shell} completions for BDFR"):
|
||||
Completion(shell).uninstall()
|
||||
return
|
||||
if shell not in ("all", "bash", "fish", "zsh"):
|
||||
print(f"{shell} is not a valid option.")
|
||||
print("Options: all, bash, fish, zsh")
|
||||
return
|
||||
if click.confirm(f"Would you like to install {shell} completions for BDFR"):
|
||||
Completion(shell).install()
|
||||
|
||||
|
||||
def make_console_logging_handler(verbosity: int) -> logging.StreamHandler:
    """Build a stdout handler whose level reflects ``verbosity``.

    ERROR records that carry exception info are filtered out so tracebacks
    stay out of the console (they still reach the file log handler).
    """

    def _drop_exception_records(record: logging.LogRecord) -> bool:
        # Veto only ERROR records that carry exc_info.
        return not (record.levelno == logging.ERROR and record.exc_info)

    # The root logger passes everything through; individual handlers filter.
    logger.setLevel(1)
    handler = logging.StreamHandler(sys.stdout)
    handler.addFilter(_drop_exception_records)
    handler.setFormatter(logging.Formatter("[%(asctime)s - %(name)s - %(levelname)s] - %(message)s"))

    if verbosity >= 2:
        # Level 9 sits below DEBUG; used for trace messages emitted via
        # logger.log(9, ...) elsewhere in the codebase.
        handler.setLevel(9)
    elif verbosity == 1:
        handler.setLevel(logging.DEBUG)
    else:
        handler.setLevel(logging.INFO)
    return handler
|
||||
|
||||
|
||||
def silence_module_loggers():
    """Raise noisy third-party loggers to CRITICAL so they stay quiet."""
    for noisy_module in ("praw", "prawcore", "urllib3"):
        logging.getLogger(noisy_module).setLevel(logging.CRITICAL)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli()
|
2
bdfr/archive_entry/__init__.py
Normal file
2
bdfr/archive_entry/__init__.py
Normal file
|
@ -0,0 +1,2 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
39
bdfr/archive_entry/base_archive_entry.py
Normal file
39
bdfr/archive_entry/base_archive_entry.py
Normal file
|
@ -0,0 +1,39 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Union
|
||||
|
||||
from praw.models import Comment, Submission
|
||||
|
||||
|
||||
class BaseArchiveEntry(ABC):
    """Common base for archivable Reddit items (submissions and comments)."""

    def __init__(self, source: Union[Comment, Submission]):
        self.source = source
        # Populated by compile(); maps field names to serialisable values.
        self.post_details: dict = {}

    @abstractmethod
    def compile(self) -> dict:
        """Return a serialisable dict describing ``self.source``."""
        raise NotImplementedError

    @staticmethod
    def _convert_comment_to_dict(in_comment: Comment) -> dict:
        """Recursively flatten a PRAW comment and its reply tree into a dict."""
        details = {
            "author": in_comment.author.name if in_comment.author else "DELETED",
            "id": in_comment.id,
            "score": in_comment.score,
            "subreddit": in_comment.subreddit.display_name,
            "author_flair": in_comment.author_flair_text,
            "submission": in_comment.submission.id,
            "stickied": in_comment.stickied,
            "body": in_comment.body,
            "is_submitter": in_comment.is_submitter,
            "distinguished": in_comment.distinguished,
            "created_utc": in_comment.created_utc,
            "parent_id": in_comment.parent_id,
            "replies": [],
        }
        # Expand the full reply tree before serialising it.
        in_comment.replies.replace_more(limit=None)
        details["replies"] = [
            BaseArchiveEntry._convert_comment_to_dict(reply) for reply in in_comment.replies
        ]
        return details
|
21
bdfr/archive_entry/comment_archive_entry.py
Normal file
21
bdfr/archive_entry/comment_archive_entry.py
Normal file
|
@ -0,0 +1,21 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import logging
|
||||
|
||||
import praw.models
|
||||
|
||||
from bdfr.archive_entry.base_archive_entry import BaseArchiveEntry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CommentArchiveEntry(BaseArchiveEntry):
    """Archive entry wrapping a single PRAW comment."""

    def __init__(self, comment: praw.models.Comment):
        super(CommentArchiveEntry, self).__init__(comment)

    def compile(self) -> dict:
        """Serialise the comment, its reply tree, and its submission's title."""
        # refresh() loads the comment's full reply tree from Reddit.
        self.source.refresh()
        self.post_details = self._convert_comment_to_dict(self.source)
        # Record which submission the comment belongs to, by title.
        self.post_details["submission_title"] = self.source.submission.title
        return self.post_details
|
51
bdfr/archive_entry/submission_archive_entry.py
Normal file
51
bdfr/archive_entry/submission_archive_entry.py
Normal file
|
@ -0,0 +1,51 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import logging
|
||||
|
||||
import praw.models
|
||||
|
||||
from bdfr.archive_entry.base_archive_entry import BaseArchiveEntry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SubmissionArchiveEntry(BaseArchiveEntry):
    """Archive entry wrapping a single PRAW submission."""

    def __init__(self, submission: praw.models.Submission):
        super(SubmissionArchiveEntry, self).__init__(submission)

    def compile(self) -> dict:
        """Serialise the submission's fields plus its full comment tree."""
        comments = self._get_comments()
        self._get_post_details()
        result = self.post_details
        result["comments"] = comments
        return result

    def _get_post_details(self):
        # Copy the submission's attributes of interest into a plain dict.
        source = self.source
        self.post_details = {
            "title": source.title,
            "name": source.name,
            "url": source.url,
            "selftext": source.selftext,
            "score": source.score,
            "upvote_ratio": source.upvote_ratio,
            "permalink": source.permalink,
            "id": source.id,
            "author": source.author.name if source.author else "DELETED",
            "link_flair_text": source.link_flair_text,
            "num_comments": source.num_comments,
            "over_18": source.over_18,
            "spoiler": source.spoiler,
            "pinned": source.pinned,
            "locked": source.locked,
            "distinguished": source.distinguished,
            "created_utc": source.created_utc,
        }

    def _get_comments(self) -> list[dict]:
        # Expanding every MoreComments object can take many API calls.
        logger.debug(f"Retrieving full comment tree for submission {self.source.id}")
        self.source.comments.replace_more(limit=None)
        return [self._convert_comment_to_dict(top_level) for top_level in self.source.comments]
|
124
bdfr/archiver.py
Normal file
124
bdfr/archiver.py
Normal file
|
@ -0,0 +1,124 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from collections.abc import Iterable, Iterator
|
||||
from pathlib import Path
|
||||
from time import sleep
|
||||
from typing import Union
|
||||
|
||||
import dict2xml
|
||||
import praw.models
|
||||
import prawcore
|
||||
import yaml
|
||||
|
||||
from bdfr.archive_entry.base_archive_entry import BaseArchiveEntry
|
||||
from bdfr.archive_entry.comment_archive_entry import CommentArchiveEntry
|
||||
from bdfr.archive_entry.submission_archive_entry import SubmissionArchiveEntry
|
||||
from bdfr.configuration import Configuration
|
||||
from bdfr.connector import RedditConnector
|
||||
from bdfr.exceptions import ArchiverError
|
||||
from bdfr.resource import Resource
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Archiver(RedditConnector):
    """Writes post/comment metadata to disk as JSON, XML, or YAML records."""

    def __init__(self, args: Configuration, logging_handlers: Iterable[logging.Handler] = ()):
        super(Archiver, self).__init__(args, logging_handlers)

    def download(self):
        """Archive every item produced by the configured Reddit sources.

        Per-item PRAW failures are logged and skipped; a failure of the
        generator itself is logged and followed by a 60-second back-off.
        """
        for generator in self.reddit_lists:
            try:
                for submission in generator:
                    try:
                        # Skip authors on the ignore list ("DELETED" matches
                        # posts whose author account is gone).
                        if (submission.author and submission.author.name in self.args.ignore_user) or (
                            submission.author is None and "DELETED" in self.args.ignore_user
                        ):
                            logger.debug(
                                f"Submission {submission.id} in {submission.subreddit.display_name} skipped due to"
                                f" {submission.author.name if submission.author else 'DELETED'} being an ignored user"
                            )
                            continue
                        if submission.id in self.excluded_submission_ids:
                            logger.debug(f"Object {submission.id} in exclusion list, skipping")
                            continue
                        logger.debug(f"Attempting to archive submission {submission.id}")
                        self.write_entry(submission)
                    except prawcore.PrawcoreException as e:
                        logger.error(f"Submission {submission.id} failed to be archived due to a PRAW exception: {e}")
            except prawcore.PrawcoreException as e:
                # NOTE(review): if the generator raises before yielding anything,
                # `submission` is unbound here and this line raises NameError —
                # confirm and guard if that path is reachable.
                logger.error(f"The submission after {submission.id} failed to download due to a PRAW exception: {e}")
                logger.debug("Waiting 60 seconds to continue")
                sleep(60)

    def get_submissions_from_link(self) -> list[list[praw.models.Submission]]:
        """Resolve --link arguments into PRAW objects.

        Heuristic: 6-character IDs are treated as submissions, 7-character
        word IDs as comments, anything else as a submission URL.
        NOTE(review): this length heuristic will misclassify IDs if Reddit's
        ID lengths change — verify against current ID formats.
        """
        supplied_submissions = []
        for sub_id in self.args.link:
            if len(sub_id) == 6:
                supplied_submissions.append(self.reddit_instance.submission(id=sub_id))
            elif re.match(r"^\w{7}$", sub_id):
                supplied_submissions.append(self.reddit_instance.comment(id=sub_id))
            else:
                supplied_submissions.append(self.reddit_instance.submission(url=sub_id))
        return [supplied_submissions]

    def get_user_data(self) -> list[Iterator]:
        """Extend the base user data with each user's comments when
        --all-comments is set."""
        results = super(Archiver, self).get_user_data()
        if self.args.user and self.args.all_comments:
            sort = self.determine_sort_function()
            for user in self.args.user:
                logger.debug(f"Retrieving comments of user {user}")
                results.append(sort(self.reddit_instance.redditor(user).comments, limit=self.args.limit))
        return results

    @staticmethod
    def _pull_lever_entry_factory(praw_item: Union[praw.models.Submission, praw.models.Comment]) -> BaseArchiveEntry:
        """Wrap a PRAW item in the matching archive-entry type."""
        if isinstance(praw_item, praw.models.Submission):
            return SubmissionArchiveEntry(praw_item)
        elif isinstance(praw_item, praw.models.Comment):
            return CommentArchiveEntry(praw_item)
        else:
            raise ArchiverError(f"Factory failed to classify item of type {type(praw_item).__name__}")

    def write_entry(self, praw_item: Union[praw.models.Submission, praw.models.Comment]):
        """Serialise one item to disk in the configured format.

        Raises ArchiverError for an unknown --format value.
        """
        if self.args.comment_context and isinstance(praw_item, praw.models.Comment):
            # --comment-context archives the whole submission instead of
            # just the single comment.
            logger.debug(f"Converting comment {praw_item.id} to submission {praw_item.submission.id}")
            praw_item = praw_item.submission
        archive_entry = self._pull_lever_entry_factory(praw_item)
        if self.args.format == "json":
            self._write_entry_json(archive_entry)
        elif self.args.format == "xml":
            self._write_entry_xml(archive_entry)
        elif self.args.format == "yaml":
            self._write_entry_yaml(archive_entry)
        else:
            raise ArchiverError(f"Unknown format {self.args.format} given")
        logger.info(f"Record for entry item {praw_item.id} written to disk")

    def _write_entry_json(self, entry: BaseArchiveEntry):
        # `lambda: None` stands in for the download callable the Resource
        # normally carries; archiver entries are generated, not downloaded.
        resource = Resource(entry.source, "", lambda: None, ".json")
        content = json.dumps(entry.compile())
        self._write_content_to_disk(resource, content)

    def _write_entry_xml(self, entry: BaseArchiveEntry):
        resource = Resource(entry.source, "", lambda: None, ".xml")
        content = dict2xml.dict2xml(entry.compile(), wrap="root")
        self._write_content_to_disk(resource, content)

    def _write_entry_yaml(self, entry: BaseArchiveEntry):
        resource = Resource(entry.source, "", lambda: None, ".yaml")
        content = yaml.safe_dump(entry.compile())
        self._write_content_to_disk(resource, content)

    def _write_content_to_disk(self, resource: Resource, content: str):
        """Write a serialised record to its formatted path, creating parents."""
        file_path = self.file_name_formatter.format_path(resource, self.download_directory)
        file_path.parent.mkdir(exist_ok=True, parents=True)
        with Path(file_path).open(mode="w", encoding="utf-8") as file:
            logger.debug(
                f"Writing entry {resource.source_submission.id} to file in {resource.extension[1:].upper()}"
                f" format at {file_path}"
            )
            file.write(content)
|
33
bdfr/cloner.py
Normal file
33
bdfr/cloner.py
Normal file
|
@ -0,0 +1,33 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import logging
|
||||
from collections.abc import Iterable
|
||||
from time import sleep
|
||||
|
||||
import prawcore
|
||||
|
||||
from bdfr.archiver import Archiver
|
||||
from bdfr.configuration import Configuration
|
||||
from bdfr.downloader import RedditDownloader
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RedditCloner(RedditDownloader, Archiver):
    """Combines downloader and archiver: each submission is both downloaded
    and archived in a single pass."""

    def __init__(self, args: Configuration, logging_handlers: Iterable[logging.Handler] = ()):
        super(RedditCloner, self).__init__(args, logging_handlers)

    def download(self):
        """Download and archive every submission from every configured source.

        Per-item PRAW failures are logged and skipped; a failure of the
        generator itself is logged and followed by a 60-second back-off.
        """
        for generator in self.reddit_lists:
            # Track the last successfully yielded ID so the outer error
            # message is safe even when the generator fails before its first
            # yield (previously `submission` was unbound there → NameError).
            last_seen_id = None
            try:
                for submission in generator:
                    last_seen_id = submission.id
                    try:
                        self._download_submission(submission)
                        self.write_entry(submission)
                    except prawcore.PrawcoreException as e:
                        logger.error(f"Submission {submission.id} failed to be cloned due to a PRAW exception: {e}")
            except prawcore.PrawcoreException as e:
                logger.error(f"The submission after {last_seen_id} failed to download due to a PRAW exception: {e}")
                logger.debug("Waiting 60 seconds to continue")
                sleep(60)
|
68
bdfr/completion.py
Normal file
68
bdfr/completion.py
Normal file
|
@ -0,0 +1,68 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import subprocess
|
||||
from os import environ
|
||||
from pathlib import Path
|
||||
|
||||
import appdirs
|
||||
|
||||
|
||||
class Completion:
    """Installs/uninstalls click shell-completion scripts for the BDFR
    entry points under the user's data directory."""

    # shell -> (completion subdir, filename prefix, filename suffix,
    #           click _*_COMPLETE env value, display name).
    # The three shells previously had three near-identical copies of the
    # install/uninstall logic; this table removes that duplication.
    _SHELL_DATA = {
        "bash": ("/bash-completion/completions/", "", "", "bash_source", "Bash"),
        "fish": ("/fish/vendor_completions.d/", "", ".fish", "fish_source", "Fish"),
        "zsh": ("/zsh/site-functions/", "_", "", "zsh_source", "Zsh"),
    }

    def __init__(self, shell: str):
        self.shell = shell
        self.env = environ.copy()
        self.share_dir = appdirs.user_data_dir()
        self.entry_points = ["bdfr", "bdfr-archive", "bdfr-clone", "bdfr-download"]

    def install(self):
        """Generate and write completion scripts for the selected shell(s)."""
        for shell_name, (subdir, prefix, suffix, env_value, display) in self._SHELL_DATA.items():
            if self.shell not in ("all", shell_name):
                continue
            comp_dir = self.share_dir + subdir
            if not Path(comp_dir).exists():
                print(f"Creating {display} completion directory.")
                Path(comp_dir).mkdir(parents=True, exist_ok=True)
            for point in self.entry_points:
                # click emits the completion script when this env var is set
                # and the entry point is invoked.
                self.env[f"_{point.upper().replace('-', '_')}_COMPLETE"] = env_value
                target = comp_dir + prefix + point + suffix
                with Path(target).open(mode="w") as file:
                    file.write(subprocess.run([point], env=self.env, capture_output=True, text=True).stdout)
                print(f"{display} completion for {point} written to {target}")

    def uninstall(self):
        """Remove any completion scripts previously written by install()."""
        for shell_name, (subdir, prefix, suffix, _env_value, display) in self._SHELL_DATA.items():
            if self.shell not in ("all", shell_name):
                continue
            comp_dir = self.share_dir + subdir
            for point in self.entry_points:
                target = comp_dir + prefix + point + suffix
                if Path(target).exists():
                    Path(target).unlink()
                    print(f"{display} completion for {point} removed from {target}")
|
90
bdfr/configuration.py
Normal file
90
bdfr/configuration.py
Normal file
|
@ -0,0 +1,90 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import logging
|
||||
from argparse import Namespace
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import click
|
||||
import yaml
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Configuration(Namespace):
    """Holds every CLI/YAML option with its default value.

    The defaults set in __init__ are the single source of truth; CLI
    arguments and YAML options only overwrite an attribute when they
    supply a real (non-None, non-empty-tuple) value.
    """

    def __init__(self):
        super(Configuration, self).__init__()
        self.authenticate = False
        self.config = None
        self.opts: Optional[str] = None
        self.directory: str = "."
        self.disable_module: list[str] = []
        self.exclude_id = []
        self.exclude_id_file = []
        self.file_scheme: str = "{REDDITOR}_{TITLE}_{POSTID}"
        self.filename_restriction_scheme = None
        self.folder_scheme: str = "{SUBREDDIT}"
        self.ignore_user = []
        self.include_id_file = []
        self.limit: Optional[int] = None
        self.link: list[str] = []
        self.log: Optional[str] = None
        self.make_hard_links = False
        self.max_wait_time = None
        self.multireddit: list[str] = []
        self.no_dupes: bool = False
        self.saved: bool = False
        self.search: Optional[str] = None
        self.search_existing: bool = False
        self.skip: list[str] = []
        self.skip_domain: list[str] = []
        self.skip_subreddit: list[str] = []
        self.min_score = None
        self.max_score = None
        self.min_score_ratio = None
        self.max_score_ratio = None
        self.sort: str = "hot"
        self.submitted: bool = False
        self.subscribed: bool = False
        self.subreddit: list[str] = []
        self.time: str = "all"
        self.time_format = None
        self.upvoted: bool = False
        self.user: list[str] = []
        self.verbose: int = 0

        # Archiver-specific options
        self.all_comments = False
        self.format = "json"
        self.comment_context: bool = False

    def process_click_arguments(self, context: click.Context):
        """Overlay click parameters (and any --opts YAML file) onto defaults.

        The YAML file is applied first so explicit CLI flags win over it.
        """
        if context.params.get("opts") is not None:
            self.parse_yaml_options(context.params["opts"])
        for arg_key in context.params.keys():
            if not hasattr(self, arg_key):
                logger.warning(f"Ignoring an unknown CLI argument: {arg_key}")
                continue
            val = context.params[arg_key]
            if val is None or val == ():
                # don't overwrite with an empty value
                continue
            setattr(self, arg_key, val)

    def parse_yaml_options(self, file_path: str):
        """Load option overrides from a YAML file; log and skip on errors."""
        yaml_file_loc = Path(file_path)
        if not yaml_file_loc.exists():
            logger.error(f"No YAML file found at {yaml_file_loc}")
            return
        with yaml_file_loc.open() as file:
            try:
                opts = yaml.safe_load(file)
            except yaml.YAMLError as e:
                logger.error(f"Could not parse YAML options file: {e}")
                return
        # safe_load returns None for an empty file; treat that as "no
        # options" instead of crashing on None.items().
        for arg_key, val in (opts or {}).items():
            if not hasattr(self, arg_key):
                logger.warning(f"Ignoring an unknown YAML argument: {arg_key}")
                continue
            setattr(self, arg_key, val)
|
457
bdfr/connector.py
Normal file
457
bdfr/connector.py
Normal file
|
@ -0,0 +1,457 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import configparser
|
||||
import importlib.resources
|
||||
import itertools
|
||||
import logging
|
||||
import logging.handlers
|
||||
import re
|
||||
import shutil
|
||||
import socket
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from collections.abc import Callable, Iterable, Iterator
|
||||
from datetime import datetime
|
||||
from enum import Enum, auto
|
||||
from pathlib import Path
|
||||
from time import sleep
|
||||
|
||||
import appdirs
|
||||
import praw
|
||||
import praw.exceptions
|
||||
import praw.models
|
||||
import prawcore
|
||||
|
||||
from bdfr import exceptions as errors
|
||||
from bdfr.configuration import Configuration
|
||||
from bdfr.download_filter import DownloadFilter
|
||||
from bdfr.file_name_formatter import FileNameFormatter
|
||||
from bdfr.oauth2 import OAuth2Authenticator, OAuth2TokenManager
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RedditTypes:
    """Namespaces the enums used to drive PRAW listing queries."""

    class SortType(Enum):
        # Sort orders for listings. NOTE(review): RELEVENCE is a misspelling
        # of RELEVANCE but renaming it would break any caller referencing the
        # member by name — confirm usage before fixing.
        CONTROVERSIAL = auto()
        HOT = auto()
        NEW = auto()
        RELEVENCE = auto()
        RISING = auto()
        TOP = auto()

    class TimeType(Enum):
        # Time windows accepted by Reddit's time-filtered listings; the
        # values are the literal strings the API expects.
        ALL = "all"
        DAY = "day"
        HOUR = "hour"
        MONTH = "month"
        WEEK = "week"
        YEAR = "year"
|
||||
|
||||
|
||||
class RedditConnector(metaclass=ABCMeta):
|
||||
    def __init__(self, args: Configuration, logging_handlers: Iterable[logging.Handler] = ()):
        """Load config, wire up logging, then build all dependent objects.

        Order matters: config must exist before the file logger, and the
        internal objects (filters, authenticator, reddit instance) must
        exist before the listing generators are created.
        """
        self.args = args
        self.config_directories = appdirs.AppDirs("bdfr", "BDFR")
        self.determine_directories()
        self.load_config()
        self.read_config()
        file_log = self.create_file_logger()
        # Attach caller-supplied handlers plus the file log to the root logger.
        self._apply_logging_handlers(itertools.chain(logging_handlers, [file_log]))
        self.run_time = datetime.now().isoformat()
        self._setup_internal_objects()

        self.reddit_lists = self.retrieve_reddit_lists()
|
||||
|
||||
    def _setup_internal_objects(self):
        """Build the filters, formatter, Reddit instance, and ID lists."""

        self.parse_disabled_modules()

        self.download_filter = self.create_download_filter()
        logger.log(9, "Created download filter")
        self.time_filter = self.create_time_filter()
        logger.log(9, "Created time filter")
        self.sort_filter = self.create_sort_filter()
        logger.log(9, "Created sort filter")
        self.file_name_formatter = self.create_file_name_formatter()
        logger.log(9, "Create file name formatter")

        self.create_reddit_instance()
        # Resolve special user names (e.g. "me"); drop any that resolve to None.
        self.args.user = list(filter(None, [self.resolve_user_name(user) for user in self.args.user]))

        # IDs excluded via --exclude-id and --exclude-id-file combined.
        self.excluded_submission_ids = set.union(
            self.read_id_files(self.args.exclude_id_file),
            set(self.args.exclude_id),
        )

        # --include-id-file entries are treated like extra --link arguments.
        self.args.link = list(itertools.chain(self.args.link, self.read_id_files(self.args.include_id_file)))

        # Hash -> path map used for duplicate detection by the downloader.
        self.master_hash_list = {}
        self.authenticator = self.create_authenticator()
        logger.log(9, "Created site authenticator")

        # Normalise skip-subreddit input to a lowercase set.
        self.args.skip_subreddit = self.split_args_input(self.args.skip_subreddit)
        self.args.skip_subreddit = {sub.lower() for sub in self.args.skip_subreddit}
|
||||
|
||||
@staticmethod
|
||||
def _apply_logging_handlers(handlers: Iterable[logging.Handler]):
|
||||
main_logger = logging.getLogger()
|
||||
for handler in handlers:
|
||||
main_logger.addHandler(handler)
|
||||
|
||||
    def read_config(self):
        """Read any cfg values that need to be processed.

        CLI values take precedence; each option is only filled from the
        config file when it was not supplied on the command line. The
        (possibly updated) config is then written back to disk.
        """
        if self.args.max_wait_time is None:
            self.args.max_wait_time = self.cfg_parser.getint("DEFAULT", "max_wait_time", fallback=120)
            logger.debug(f"Setting maximum download wait time to {self.args.max_wait_time} seconds")
        if self.args.time_format is None:
            option = self.cfg_parser.get("DEFAULT", "time_format", fallback="ISO")
            # A blank/quoted-empty config value falls back to ISO as well.
            if re.match(r"^[\s\'\"]*$", option):
                option = "ISO"
            logger.debug(f"Setting datetime format string to {option}")
            self.args.time_format = option
        if not self.args.disable_module:
            self.args.disable_module = [self.cfg_parser.get("DEFAULT", "disabled_modules", fallback="")]
        if not self.args.filename_restriction_scheme:
            self.args.filename_restriction_scheme = self.cfg_parser.get(
                "DEFAULT", "filename_restriction_scheme", fallback=None
            )
            logger.debug(f"Setting filename restriction scheme to '{self.args.filename_restriction_scheme}'")
        # Update config on disk
        with Path(self.config_location).open(mode="w") as file:
            self.cfg_parser.write(file)
|
||||
|
||||
def parse_disabled_modules(self):
    """Normalise the disabled-module arguments to a set of lower-case names."""
    normalised_names = {
        name.strip().lower() for name in self.split_args_input(self.args.disable_module)
    }
    self.args.disable_module = normalised_names
    logger.debug(f'Disabling the following modules: {", ".join(self.args.disable_module)}')
|
||||
|
||||
def create_reddit_instance(self):
    """Create the praw Reddit client, performing OAuth2 on first authenticated use.

    Sets ``self.authenticated`` and ``self.reddit_instance``.
    """
    if self.args.authenticate:
        logger.debug("Using authenticated Reddit instance")
        if not self.cfg_parser.has_option("DEFAULT", "user_token"):
            # First authenticated run: obtain a token interactively and
            # persist it to the config file for subsequent runs.
            logger.log(9, "Commencing OAuth2 authentication")
            scopes = self.cfg_parser.get("DEFAULT", "scopes", fallback="identity, history, read, save")
            scopes = OAuth2Authenticator.split_scopes(scopes)
            oauth2_authenticator = OAuth2Authenticator(
                scopes,
                self.cfg_parser.get("DEFAULT", "client_id"),
                self.cfg_parser.get("DEFAULT", "client_secret"),
            )
            token = oauth2_authenticator.retrieve_new_token()
            self.cfg_parser["DEFAULT"]["user_token"] = token
            with Path(self.config_location).open(mode="w") as file:
                self.cfg_parser.write(file, True)
        token_manager = OAuth2TokenManager(self.cfg_parser, self.config_location)

        self.authenticated = True
        self.reddit_instance = praw.Reddit(
            client_id=self.cfg_parser.get("DEFAULT", "client_id"),
            client_secret=self.cfg_parser.get("DEFAULT", "client_secret"),
            user_agent=socket.gethostname(),
            token_manager=token_manager,
        )
    else:
        logger.debug("Using unauthenticated Reddit instance")
        self.authenticated = False
        self.reddit_instance = praw.Reddit(
            client_id=self.cfg_parser.get("DEFAULT", "client_id"),
            client_secret=self.cfg_parser.get("DEFAULT", "client_secret"),
            user_agent=socket.gethostname(),
        )
|
||||
|
||||
def retrieve_reddit_lists(self) -> list[praw.models.ListingGenerator]:
    """Collect submission generators from every configured source type."""
    source_getters = (
        (self.get_subreddits, "Retrieved subreddits"),
        (self.get_multireddits, "Retrieved multireddits"),
        (self.get_user_data, "Retrieved user data"),
        (self.get_submissions_from_link, "Retrieved submissions for given links"),
    )
    master_list = []
    for getter, message in source_getters:
        master_list.extend(getter())
        logger.log(9, message)
    return master_list
|
||||
|
||||
def determine_directories(self):
    """Resolve and create the download and config directories.

    Sets ``self.download_directory`` and ``self.config_directory`` and
    creates both on disk (including parents) when missing.
    """
    # expanduser() must run before resolve(): resolving first would anchor a
    # leading "~" against the CWD, after which expanduser() is a no-op.
    self.download_directory = Path(self.args.directory).expanduser().resolve()
    self.config_directory = Path(self.config_directories.user_config_dir)

    self.download_directory.mkdir(exist_ok=True, parents=True)
    self.config_directory.mkdir(exist_ok=True, parents=True)
|
||||
|
||||
def load_config(self):
    """Locate and read the configuration file.

    Search order: an explicit --config path, then config.cfg /
    default_config.cfg in the CWD and the user config directory, and
    finally the default config bundled with the bdfr package (which is
    also copied into the config directory for future runs).
    """
    self.cfg_parser = configparser.ConfigParser()
    if self.args.config:
        if (cfg_path := Path(self.args.config)).exists():
            self.cfg_parser.read(cfg_path)
            self.config_location = cfg_path
            return
        # NOTE(review): a --config path that does not exist is silently
        # ignored and the search below proceeds — confirm this is intended.
    possible_paths = [
        Path("./config.cfg"),
        Path("./default_config.cfg"),
        Path(self.config_directory, "config.cfg"),
        Path(self.config_directory, "default_config.cfg"),
    ]
    self.config_location = None
    for path in possible_paths:
        if path.resolve().expanduser().exists():
            self.config_location = path
            logger.debug(f"Loading configuration from {path}")
            break
    if not self.config_location:
        # Fall back to the default config shipped inside the package.
        with importlib.resources.path("bdfr", "default_config.cfg") as path:
            self.config_location = path
            shutil.copy(self.config_location, Path(self.config_directory, "default_config.cfg"))
    if not self.config_location:
        raise errors.BulkDownloaderException("Could not find a configuration file to load")
    self.cfg_parser.read(self.config_location)
|
||||
|
||||
def create_file_logger(self) -> logging.handlers.RotatingFileHandler:
    """Build the rotating file handler for the BDFR logfile.

    Logs everything (level 0) to --log, or to log_output.txt in the config
    directory when --log is not given. If the logfile already exists it is
    rolled over so each run starts a fresh file.

    Raises:
        errors.BulkDownloaderException: if the parent directory of the
            requested logfile does not exist.
    """
    if self.args.log is None:
        log_path = Path(self.config_directory, "log_output.txt")
    else:
        # expanduser() before resolve(), otherwise a leading "~" is anchored
        # to the CWD and never expanded to the home directory.
        log_path = Path(self.args.log).expanduser().resolve()
        if not log_path.parent.exists():
            raise errors.BulkDownloaderException("Designated location for logfile does not exist")
    backup_count = self.cfg_parser.getint("DEFAULT", "backup_log_count", fallback=3)
    file_handler = logging.handlers.RotatingFileHandler(
        log_path,
        mode="a",
        backupCount=backup_count,
    )
    if log_path.exists():
        try:
            file_handler.doRollover()
        except PermissionError:
            # Another process (or OS lock) holds the logfile open.
            logger.critical(
                "Cannot rollover logfile, make sure this is the only "
                "BDFR process or specify alternate logfile location"
            )
            raise
    formatter = logging.Formatter("[%(asctime)s - %(name)s - %(levelname)s] - %(message)s")
    file_handler.setFormatter(formatter)
    file_handler.setLevel(0)
    return file_handler
|
||||
|
||||
@staticmethod
|
||||
def sanitise_subreddit_name(subreddit: str) -> str:
|
||||
pattern = re.compile(r"^(?:https://www\.reddit\.com/)?(?:r/)?(.*?)/?$")
|
||||
match = re.match(pattern, subreddit)
|
||||
if not match:
|
||||
raise errors.BulkDownloaderException(f"Could not find subreddit name in string {subreddit}")
|
||||
return match.group(1)
|
||||
|
||||
@staticmethod
def split_args_input(entries: list[str]) -> set[str]:
    """Split comma/semicolon-separated CLI entries and sanitise each name."""
    separator = re.compile(r"[,;]\s?")
    return {
        RedditConnector.sanitise_subreddit_name(name)
        for entry in entries
        for name in separator.split(entry)
    }
|
||||
|
||||
def get_subreddits(self) -> list[praw.models.ListingGenerator]:
    """Build listing generators for every requested subreddit.

    Combines --subreddit values with the authenticated user's subscribed
    subreddits when --subscribed is given; sources that are missing,
    banned, or private are logged and skipped.
    """
    out = []
    subscribed_subreddits = set()
    if self.args.subscribed:
        if self.args.authenticate:
            try:
                subscribed_subreddits = list(self.reddit_instance.user.subreddits(limit=None))
                subscribed_subreddits = {s.display_name for s in subscribed_subreddits}
            except prawcore.InsufficientScope:
                logger.error("BDFR has insufficient scope to access subreddit lists")
        else:
            logger.error("Cannot find subscribed subreddits without an authenticated instance")
    if self.args.subreddit or subscribed_subreddits:
        for reddit in self.split_args_input(self.args.subreddit) | subscribed_subreddits:
            # "friends" is a pseudo-subreddit that praw only serves to
            # authenticated clients.
            if reddit == "friends" and self.authenticated is False:
                logger.error("Cannot read friends subreddit without an authenticated instance")
                continue
            try:
                reddit = self.reddit_instance.subreddit(reddit)
                try:
                    self.check_subreddit_status(reddit)
                except errors.BulkDownloaderException as e:
                    logger.error(e)
                    continue
                if self.args.search:
                    out.append(
                        reddit.search(
                            self.args.search,
                            sort=self.sort_filter.name.lower(),
                            limit=self.args.limit,
                            time_filter=self.time_filter.value,
                        )
                    )
                    logger.debug(
                        f'Added submissions from subreddit {reddit} with the search term "{self.args.search}"'
                    )
                else:
                    out.append(self.create_filtered_listing_generator(reddit))
                    logger.debug(f"Added submissions from subreddit {reddit}")
            except (errors.BulkDownloaderException, praw.exceptions.PRAWException) as e:
                logger.error(f"Failed to get submissions for subreddit {reddit}: {e}")
    return out
|
||||
|
||||
def resolve_user_name(self, in_name: str) -> str:
    """Map the special name "me" to the authenticated user; pass other names through.

    Implicitly returns None when "me" is requested without authentication.
    """
    if in_name != "me":
        return in_name
    if self.authenticated:
        resolved_name = self.reddit_instance.user.me().name
        logger.log(9, f"Resolved user to {resolved_name}")
        return resolved_name
    logger.warning('To use "me" as a user, an authenticated Reddit instance must be used')
|
||||
|
||||
def get_submissions_from_link(self) -> list[list[praw.models.Submission]]:
    """Resolve every --link argument into a Submission.

    Arguments of 6 or 7 characters are treated as submission IDs; anything
    else is treated as a full URL.
    """
    supplied_submissions = [
        self.reddit_instance.submission(id=sub_id)
        if len(sub_id) in (6, 7)
        else self.reddit_instance.submission(url=sub_id)
        for sub_id in self.args.link
    ]
    return [supplied_submissions]
|
||||
|
||||
def determine_sort_function(self) -> Callable:
    """Map the configured sort filter onto the matching praw listing method."""
    dispatch = {
        RedditTypes.SortType.NEW: praw.models.Subreddit.new,
        RedditTypes.SortType.RISING: praw.models.Subreddit.rising,
        RedditTypes.SortType.CONTROVERSIAL: praw.models.Subreddit.controversial,
        RedditTypes.SortType.TOP: praw.models.Subreddit.top,
    }
    # Anything unrecognised falls back to the "hot" listing.
    return dispatch.get(self.sort_filter, praw.models.Subreddit.hot)
|
||||
|
||||
def get_multireddits(self) -> list[Iterator]:
    """Build listing generators for each requested multireddit."""
    if not self.args.multireddit:
        return []
    if len(self.args.user) != 1:
        logger.error("Only 1 user can be supplied when retrieving from multireddits")
        return []
    out = []
    for multi in self.split_args_input(self.args.multireddit):
        try:
            multi = self.reddit_instance.multireddit(redditor=self.args.user[0], name=multi)
            # Treat an empty subreddit list as a failed lookup.
            if not multi.subreddits:
                raise errors.BulkDownloaderException
            out.append(self.create_filtered_listing_generator(multi))
            logger.debug(f"Added submissions from multireddit {multi}")
        except (errors.BulkDownloaderException, praw.exceptions.PRAWException, prawcore.PrawcoreException) as e:
            logger.error(f"Failed to get submissions for multireddit {multi}: {e}")
    return out
|
||||
|
||||
def create_filtered_listing_generator(self, reddit_source) -> Iterator:
    """Apply the configured sort (and time filter where applicable) to a source."""
    sort_function = self.determine_sort_function()
    # Only the time-windowed sorts accept a time_filter argument.
    time_filtered_sorts = (RedditTypes.SortType.TOP, RedditTypes.SortType.CONTROVERSIAL)
    if self.sort_filter in time_filtered_sorts:
        return sort_function(reddit_source, limit=self.args.limit, time_filter=self.time_filter.value)
    return sort_function(reddit_source, limit=self.args.limit)
|
||||
|
||||
def get_user_data(self) -> list[Iterator]:
    """Build generators for the submitted/upvoted/saved posts of each user.

    Returns an empty list unless at least one of --submitted, --upvoted or
    --saved was requested. Upvoted and saved listings require an
    authenticated instance.
    """
    if any([self.args.submitted, self.args.upvoted, self.args.saved]):
        if not self.args.user:
            logger.warning("At least one user must be supplied to download user data")
            return []
        generators = []
        for user in self.args.user:
            try:
                try:
                    self.check_user_existence(user)
                except errors.BulkDownloaderException as e:
                    logger.error(e)
                    continue
                if self.args.submitted:
                    logger.debug(f"Retrieving submitted posts of user {user}")
                    generators.append(
                        self.create_filtered_listing_generator(
                            self.reddit_instance.redditor(user).submissions,
                        )
                    )
                if not self.authenticated and any((self.args.upvoted, self.args.saved)):
                    logger.warning("Accessing user lists requires authentication")
                else:
                    if self.args.upvoted:
                        logger.debug(f"Retrieving upvoted posts of user {user}")
                        generators.append(self.reddit_instance.redditor(user).upvoted(limit=self.args.limit))
                    if self.args.saved:
                        logger.debug(f"Retrieving saved posts of user {user}")
                        generators.append(self.reddit_instance.redditor(user).saved(limit=self.args.limit))
            except prawcore.PrawcoreException as e:
                logger.error(f"User {user} failed to be retrieved due to a PRAW exception: {e}")
                logger.debug("Waiting 60 seconds to continue")
                # Back off before the next user — presumably rate limiting; TODO confirm.
                sleep(60)
        return generators
    else:
        return []
|
||||
|
||||
def check_user_existence(self, name: str):
    """Verify that a redditor exists and is not banned.

    Accessing ``user.id`` forces praw to fetch the lazy redditor object.

    Raises:
        errors.BulkDownloaderException: if the user cannot be found or is
            banned (an AttributeError on ``id`` plus a present
            ``is_suspended`` attribute).
    """
    user = self.reddit_instance.redditor(name=name)
    try:
        if user.id:
            return
    except prawcore.exceptions.NotFound as e:
        # Chain the original exception so the praw failure is not masked.
        raise errors.BulkDownloaderException(f"Could not find user {name}") from e
    except AttributeError:
        if hasattr(user, "is_suspended"):
            raise errors.BulkDownloaderException(f"User {name} is banned")
|
||||
|
||||
def create_file_name_formatter(self) -> FileNameFormatter:
    """Construct the formatter used to build destination paths."""
    return FileNameFormatter(
        self.args.file_scheme,
        self.args.folder_scheme,
        self.args.time_format,
        self.args.filename_restriction_scheme,
    )
|
||||
|
||||
def create_time_filter(self) -> RedditTypes.TimeType:
    """Translate the --time argument into a TimeType."""
    requested = self.args.time
    try:
        return RedditTypes.TimeType[requested.upper()]
    except (KeyError, AttributeError):
        # Missing or unrecognised values fall back to the broadest window.
        return RedditTypes.TimeType.ALL
|
||||
|
||||
def create_sort_filter(self) -> RedditTypes.SortType:
    """Translate the --sort argument into a SortType."""
    requested = self.args.sort
    try:
        return RedditTypes.SortType[requested.upper()]
    except (KeyError, AttributeError):
        # Missing or unrecognised values fall back to the "hot" sort.
        return RedditTypes.SortType.HOT
|
||||
|
||||
def create_download_filter(self) -> DownloadFilter:
    """Construct the URL/extension filter from the skip arguments."""
    return DownloadFilter(
        self.args.skip,
        self.args.skip_domain,
    )
|
||||
|
||||
def create_authenticator(self) -> SiteAuthenticator:
    """Wrap the parsed configuration in a SiteAuthenticator."""
    return SiteAuthenticator(
        self.cfg_parser,
    )
|
||||
|
||||
@abstractmethod
def download(self):
    """Run the download process; implemented by concrete connector subclasses."""
    pass
|
||||
|
||||
@staticmethod
def check_subreddit_status(subreddit: praw.models.Subreddit):
    """Raise if *subreddit* cannot be downloaded from.

    Raises:
        errors.BulkDownloaderException: when the subreddit is not found,
            does not exist (redirect), or is private (forbidden).
    """
    if subreddit.display_name in ("all", "friends"):
        # Pseudo-subreddits have no fetchable metadata but are valid sources.
        return
    try:
        # The attribute access alone triggers praw's lazy fetch, surfacing
        # any prawcore error. A bare `assert` here would be stripped under
        # `python -O`, silently disabling this check.
        subreddit.id
    except prawcore.NotFound as e:
        raise errors.BulkDownloaderException(f"Source {subreddit.display_name} cannot be found") from e
    except prawcore.Redirect as e:
        raise errors.BulkDownloaderException(f"Source {subreddit.display_name} does not exist") from e
    except prawcore.Forbidden as e:
        raise errors.BulkDownloaderException(f"Source {subreddit.display_name} is private and cannot be scraped") from e
|
||||
|
||||
@staticmethod
|
||||
def read_id_files(file_locations: list[str]) -> set[str]:
|
||||
out = []
|
||||
for id_file in file_locations:
|
||||
id_file = Path(id_file).resolve().expanduser()
|
||||
if not id_file.exists():
|
||||
logger.warning(f"ID file at {id_file} does not exist")
|
||||
continue
|
||||
with id_file.open("r") as file:
|
||||
for line in file:
|
||||
out.append(line.strip())
|
||||
return set(out)
|
7
bdfr/default_config.cfg
Normal file
7
bdfr/default_config.cfg
Normal file
|
@ -0,0 +1,7 @@
|
|||
[DEFAULT]
|
||||
client_id = U-6gk4ZCh3IeNQ
|
||||
client_secret = 7CZHY6AmKweZME5s50SfDGylaPg
|
||||
scopes = identity, history, read, save, mysubreddits
|
||||
backup_log_count = 3
|
||||
max_wait_time = 120
|
||||
time_format = ISO
|
53
bdfr/download_filter.py
Normal file
53
bdfr/download_filter.py
Normal file
|
@ -0,0 +1,53 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import logging
|
||||
import re
|
||||
|
||||
from bdfr.resource import Resource
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DownloadFilter:
    """Filters download candidates by file extension and source domain."""

    def __init__(self, excluded_extensions: list[str] = None, excluded_domains: list[str] = None):
        """Store the exclusion lists; None or empty means everything is allowed."""
        self.excluded_extensions = excluded_extensions
        self.excluded_domains = excluded_domains

    def check_url(self, url: str) -> bool:
        """Return whether a URL is allowed or not"""
        if not self._check_extension(url):
            return False
        elif not self._check_domain(url):
            return False
        else:
            return True

    def check_resource(self, res: "Resource") -> bool:
        """Return whether a resolved resource passes both filters."""
        if not self._check_extension(res.extension):
            return False
        elif not self._check_domain(res.url):
            return False
        return True

    def _check_extension(self, resource_extension: str) -> bool:
        """Return False when *resource_extension* ends with an excluded extension."""
        if not self.excluded_extensions:
            return True
        # BUGFIX: escape each entry so regex metacharacters (e.g. a leading
        # "." in ".mp4") match literally instead of acting as wildcards.
        combined_extensions = "|".join(re.escape(extension) for extension in self.excluded_extensions)
        pattern = re.compile(r".*({})$".format(combined_extensions))
        if re.match(pattern, resource_extension):
            logger.log(9, f'Url "{resource_extension}" matched with "{pattern}"')
            return False
        else:
            return True

    def _check_domain(self, url: str) -> bool:
        """Return False when *url* contains an excluded domain."""
        if not self.excluded_domains:
            return True
        # BUGFIX: escape domains so e.g. "v.redd.it" cannot match "vXreddYit".
        combined_domains = "|".join(re.escape(domain) for domain in self.excluded_domains)
        pattern = re.compile(r"https?://.*({}).*".format(combined_domains))
        if re.match(pattern, url):
            logger.log(9, f'Url "{url}" matched with "{pattern}"')
            return False
        else:
            return True
|
168
bdfr/downloader.py
Normal file
168
bdfr/downloader.py
Normal file
|
@ -0,0 +1,168 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import hashlib
|
||||
import logging.handlers
|
||||
import os
|
||||
import time
|
||||
from collections.abc import Iterable
|
||||
from datetime import datetime
|
||||
from multiprocessing import Pool
|
||||
from pathlib import Path
|
||||
from time import sleep
|
||||
|
||||
import praw
|
||||
import praw.exceptions
|
||||
import praw.models
|
||||
import prawcore
|
||||
|
||||
from bdfr import exceptions as errors
|
||||
from bdfr.configuration import Configuration
|
||||
from bdfr.connector import RedditConnector
|
||||
from bdfr.site_downloaders.download_factory import DownloadFactory
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _calc_hash(existing_file: Path):
|
||||
chunk_size = 1024 * 1024
|
||||
md5_hash = hashlib.md5()
|
||||
with existing_file.open("rb") as file:
|
||||
chunk = file.read(chunk_size)
|
||||
while chunk:
|
||||
md5_hash.update(chunk)
|
||||
chunk = file.read(chunk_size)
|
||||
file_hash = md5_hash.hexdigest()
|
||||
return existing_file, file_hash
|
||||
|
||||
|
||||
class RedditDownloader(RedditConnector):
    """RedditConnector implementation that downloads submission media to disk."""

    def __init__(self, args: Configuration, logging_handlers: Iterable[logging.Handler] = ()):
        super(RedditDownloader, self).__init__(args, logging_handlers)
        if self.args.search_existing:
            # Pre-hash files already on disk so duplicates can be skipped
            # or hard-linked.
            self.master_hash_list = self.scan_existing_files(self.download_directory)

    def download(self):
        """Download every submission produced by the configured sources."""
        for generator in self.reddit_lists:
            try:
                for submission in generator:
                    try:
                        self._download_submission(submission)
                    except prawcore.PrawcoreException as e:
                        logger.error(f"Submission {submission.id} failed to download due to a PRAW exception: {e}")
            except prawcore.PrawcoreException as e:
                # The generator itself failed; back off before the next source.
                logger.error(f"The submission after {submission.id} failed to download due to a PRAW exception: {e}")
                logger.debug("Waiting 60 seconds to continue")
                sleep(60)

    def _download_submission(self, submission: praw.models.Submission):
        """Apply every configured filter to *submission*, then download its resources.

        Writes each resource to its formatted destination, deduplicates by
        MD5 hash, and backdates the file's timestamps to the submission's
        creation time.
        """
        # --- filter chain: each branch logs the reason and bails out ---
        if submission.id in self.excluded_submission_ids:
            logger.debug(f"Object {submission.id} in exclusion list, skipping")
            return
        elif submission.subreddit.display_name.lower() in self.args.skip_subreddit:
            logger.debug(f"Submission {submission.id} in {submission.subreddit.display_name} in skip list")
            return
        elif (submission.author and submission.author.name in self.args.ignore_user) or (
            submission.author is None and "DELETED" in self.args.ignore_user
        ):
            logger.debug(
                f"Submission {submission.id} in {submission.subreddit.display_name} skipped"
                f' due to {submission.author.name if submission.author else "DELETED"} being an ignored user'
            )
            return
        elif self.args.min_score and submission.score < self.args.min_score:
            logger.debug(
                f"Submission {submission.id} filtered due to score {submission.score} < [{self.args.min_score}]"
            )
            return
        elif self.args.max_score and self.args.max_score < submission.score:
            logger.debug(
                f"Submission {submission.id} filtered due to score {submission.score} > [{self.args.max_score}]"
            )
            return
        elif (self.args.min_score_ratio and submission.upvote_ratio < self.args.min_score_ratio) or (
            self.args.max_score_ratio and self.args.max_score_ratio < submission.upvote_ratio
        ):
            logger.debug(f"Submission {submission.id} filtered due to score ratio ({submission.upvote_ratio})")
            return
        elif not isinstance(submission, praw.models.Submission):
            # NOTE(review): this type check runs after several attribute
            # accesses above; a non-Submission object may raise earlier.
            logger.warning(f"{submission.id} is not a submission")
            return
        elif not self.download_filter.check_url(submission.url):
            logger.debug(f"Submission {submission.id} filtered due to URL {submission.url}")
            return

        logger.debug(f"Attempting to download submission {submission.id}")
        try:
            downloader_class = DownloadFactory.pull_lever(submission.url)
            downloader = downloader_class(submission)
            logger.debug(f"Using {downloader_class.__name__} with url {submission.url}")
        except errors.NotADownloadableLinkError as e:
            logger.error(f"Could not download submission {submission.id}: {e}")
            return
        if downloader_class.__name__.lower() in self.args.disable_module:
            logger.debug(f"Submission {submission.id} skipped due to disabled module {downloader_class.__name__}")
            return
        try:
            content = downloader.find_resources(self.authenticator)
        except errors.SiteDownloaderError as e:
            logger.error(f"Site {downloader_class.__name__} failed to download submission {submission.id}: {e}")
            return
        for destination, res in self.file_name_formatter.format_resource_paths(content, self.download_directory):
            if destination.exists():
                logger.debug(f"File {destination} from submission {submission.id} already exists, continuing")
                continue
            elif not self.download_filter.check_resource(res):
                logger.debug(f"Download filter removed {submission.id} file with URL {submission.url}")
                continue
            try:
                res.download({"max_wait_time": self.args.max_wait_time})
            except errors.BulkDownloaderException as e:
                logger.error(
                    f"Failed to download resource {res.url} in submission {submission.id} "
                    f"with downloader {downloader_class.__name__}: {e}"
                )
                return
            resource_hash = res.hash.hexdigest()
            destination.parent.mkdir(parents=True, exist_ok=True)
            if resource_hash in self.master_hash_list:
                if self.args.no_dupes:
                    logger.info(f"Resource hash {resource_hash} from submission {submission.id} downloaded elsewhere")
                    return
                elif self.args.make_hard_links:
                    try:
                        destination.hardlink_to(self.master_hash_list[resource_hash])
                    except AttributeError:
                        # Path.hardlink_to() requires Python 3.10; fall back to
                        # the older link_to() (reversed argument order).
                        self.master_hash_list[resource_hash].link_to(destination)
                    logger.info(
                        f"Hard link made linking {destination} to {self.master_hash_list[resource_hash]}"
                        f" in submission {submission.id}"
                    )
                    return
            try:
                with destination.open("wb") as file:
                    file.write(res.content)
                logger.debug(f"Written file to {destination}")
            except OSError as e:
                logger.exception(e)
                logger.error(f"Failed to write file in submission {submission.id} to {destination}: {e}")
                return
            # Backdate the file's access/modification times to the
            # submission's creation time.
            creation_time = time.mktime(datetime.fromtimestamp(submission.created_utc).timetuple())
            os.utime(destination, (creation_time, creation_time))
            self.master_hash_list[resource_hash] = destination
            logger.debug(f"Hash added to master list: {resource_hash}")
        logger.info(f"Downloaded submission {submission.id} from {submission.subreddit.display_name}")

    @staticmethod
    def scan_existing_files(directory: Path) -> dict[str, Path]:
        """Walk *directory* and return a mapping of MD5 hash -> file path."""
        files = []
        for (dirpath, _dirnames, filenames) in os.walk(directory):
            files.extend([Path(dirpath, file) for file in filenames])
        logger.info(f"Calculating hashes for {len(files)} files")

        # Hashing is CPU/IO heavy; fan it out across worker processes.
        # pool.map blocks until all results are in.
        pool = Pool(15)
        results = pool.map(_calc_hash, files)
        pool.close()

        hash_list = {res[1]: res[0] for res in results}
        return hash_list
|
30
bdfr/exceptions.py
Normal file
30
bdfr/exceptions.py
Normal file
|
@ -0,0 +1,30 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
|
||||
class BulkDownloaderException(Exception):
    """Root of the BDFR exception hierarchy."""

    pass
|
||||
|
||||
|
||||
class RedditUserError(BulkDownloaderException):
    """BulkDownloaderException subtype for Reddit user/account failures."""

    pass
|
||||
|
||||
|
||||
class RedditAuthenticationError(RedditUserError):
    """RedditUserError subtype for authentication failures."""

    pass
|
||||
|
||||
|
||||
class ArchiverError(BulkDownloaderException):
    """BulkDownloaderException subtype raised by archiver components."""

    pass
|
||||
|
||||
|
||||
class SiteDownloaderError(BulkDownloaderException):
    """BulkDownloaderException subtype raised by site downloader modules."""

    pass
|
||||
|
||||
|
||||
class NotADownloadableLinkError(SiteDownloaderError):
    """SiteDownloaderError subtype for URLs no downloader can handle."""

    pass
|
||||
|
||||
|
||||
class ResourceNotFound(SiteDownloaderError):
    """SiteDownloaderError subtype for resources that could not be retrieved."""

    pass
|
227
bdfr/file_name_formatter.py
Normal file
227
bdfr/file_name_formatter.py
Normal file
|
@ -0,0 +1,227 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import datetime
|
||||
import logging
|
||||
import platform
|
||||
import re
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union
|
||||
|
||||
from praw.models import Comment, Submission
|
||||
|
||||
from bdfr.exceptions import BulkDownloaderException
|
||||
from bdfr.resource import Resource
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FileNameFormatter:
|
||||
key_terms = (
|
||||
"date",
|
||||
"flair",
|
||||
"postid",
|
||||
"redditor",
|
||||
"subreddit",
|
||||
"title",
|
||||
"upvotes",
|
||||
)
|
||||
WINDOWS_MAX_PATH_LENGTH = 260
|
||||
LINUX_MAX_PATH_LENGTH = 4096
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_format_string: str,
|
||||
directory_format_string: str,
|
||||
time_format_string: str,
|
||||
restriction_scheme: Optional[str] = None,
|
||||
):
|
||||
if not self.validate_string(file_format_string):
|
||||
raise BulkDownloaderException(f'"{file_format_string}" is not a valid format string')
|
||||
self.file_format_string = file_format_string
|
||||
self.directory_format_string: list[str] = directory_format_string.split("/")
|
||||
self.time_format_string = time_format_string
|
||||
self.restiction_scheme = restriction_scheme.lower().strip() if restriction_scheme else None
|
||||
if self.restiction_scheme == "windows":
|
||||
self.max_path = self.WINDOWS_MAX_PATH_LENGTH
|
||||
else:
|
||||
self.max_path = self.find_max_path_length()
|
||||
|
||||
def _format_name(self, submission: Union[Comment, Submission], format_string: str) -> str:
|
||||
if isinstance(submission, Submission):
|
||||
attributes = self._generate_name_dict_from_submission(submission)
|
||||
elif isinstance(submission, Comment):
|
||||
attributes = self._generate_name_dict_from_comment(submission)
|
||||
else:
|
||||
raise BulkDownloaderException(f"Cannot name object {type(submission).__name__}")
|
||||
result = format_string
|
||||
for key in attributes.keys():
|
||||
if re.search(rf"(?i).*{{{key}}}.*", result):
|
||||
key_value = str(attributes.get(key, "unknown"))
|
||||
key_value = FileNameFormatter._convert_unicode_escapes(key_value)
|
||||
key_value = key_value.replace("\\", "\\\\")
|
||||
result = re.sub(rf"(?i){{{key}}}", key_value, result)
|
||||
|
||||
result = result.replace("/", "")
|
||||
|
||||
if self.restiction_scheme is None:
|
||||
if platform.system() == "Windows":
|
||||
result = FileNameFormatter._format_for_windows(result)
|
||||
elif self.restiction_scheme == "windows":
|
||||
logger.debug("Forcing Windows-compatible filenames")
|
||||
result = FileNameFormatter._format_for_windows(result)
|
||||
return result
|
||||
|
||||
@staticmethod
|
||||
def _convert_unicode_escapes(in_string: str) -> str:
|
||||
pattern = re.compile(r"(\\u\d{4})")
|
||||
matches = re.search(pattern, in_string)
|
||||
if matches:
|
||||
for match in matches.groups():
|
||||
converted_match = bytes(match, "utf-8").decode("unicode-escape")
|
||||
in_string = in_string.replace(match, converted_match)
|
||||
return in_string
|
||||
|
||||
def _generate_name_dict_from_submission(self, submission: Submission) -> dict:
|
||||
submission_attributes = {
|
||||
"title": submission.title,
|
||||
"subreddit": submission.subreddit.display_name,
|
||||
"redditor": submission.author.name if submission.author else "DELETED",
|
||||
"postid": submission.id,
|
||||
"upvotes": submission.score,
|
||||
"flair": submission.link_flair_text,
|
||||
"date": self._convert_timestamp(submission.created_utc),
|
||||
}
|
||||
return submission_attributes
|
||||
|
||||
def _convert_timestamp(self, timestamp: float) -> str:
|
||||
input_time = datetime.datetime.fromtimestamp(timestamp)
|
||||
if self.time_format_string.upper().strip() == "ISO":
|
||||
return input_time.isoformat()
|
||||
else:
|
||||
return input_time.strftime(self.time_format_string)
|
||||
|
||||
def _generate_name_dict_from_comment(self, comment: Comment) -> dict:
|
||||
comment_attributes = {
|
||||
"title": comment.submission.title,
|
||||
"subreddit": comment.subreddit.display_name,
|
||||
"redditor": comment.author.name if comment.author else "DELETED",
|
||||
"postid": comment.id,
|
||||
"upvotes": comment.score,
|
||||
"flair": "",
|
||||
"date": self._convert_timestamp(comment.created_utc),
|
||||
}
|
||||
return comment_attributes
|
||||
|
||||
def format_path(
|
||||
self,
|
||||
resource: Resource,
|
||||
destination_directory: Path,
|
||||
index: Optional[int] = None,
|
||||
) -> Path:
|
||||
subfolder = Path(
|
||||
destination_directory,
|
||||
*[self._format_name(resource.source_submission, part) for part in self.directory_format_string],
|
||||
)
|
||||
index = f"_{index}" if index else ""
|
||||
if not resource.extension:
|
||||
raise BulkDownloaderException(f"Resource from {resource.url} has no extension")
|
||||
file_name = str(self._format_name(resource.source_submission, self.file_format_string))
|
||||
|
||||
file_name = re.sub(r"\n", " ", file_name)
|
||||
|
||||
if not re.match(r".*\.$", file_name) and not re.match(r"^\..*", resource.extension):
|
||||
ending = index + "." + resource.extension
|
||||
else:
|
||||
ending = index + resource.extension
|
||||
|
||||
try:
|
||||
file_path = self.limit_file_name_length(file_name, ending, subfolder)
|
||||
except TypeError:
|
||||
raise BulkDownloaderException(f"Could not determine path name: {subfolder}, {index}, {resource.extension}")
|
||||
return file_path
|
||||
|
||||
def limit_file_name_length(self, filename: str, ending: str, root: Path) -> Path:
|
||||
root = root.resolve().expanduser()
|
||||
possible_id = re.search(r"((?:_\w{6})?$)", filename)
|
||||
if possible_id:
|
||||
ending = possible_id.group(1) + ending
|
||||
filename = filename[: possible_id.start()]
|
||||
max_path = self.max_path
|
||||
max_file_part_length_chars = 255 - len(ending)
|
||||
max_file_part_length_bytes = 255 - len(ending.encode("utf-8"))
|
||||
max_path_length = max_path - len(ending) - len(str(root)) - 1
|
||||
|
||||
out = Path(root, filename + ending)
|
||||
while any(
|
||||
[
|
||||
len(filename) > max_file_part_length_chars,
|
||||
len(filename.encode("utf-8")) > max_file_part_length_bytes,
|
||||
len(str(out)) > max_path_length,
|
||||
]
|
||||
):
|
||||
filename = filename[:-1]
|
||||
out = Path(root, filename + ending)
|
||||
|
||||
return out
|
||||
|
||||
@staticmethod
|
||||
def find_max_path_length() -> int:
|
||||
try:
|
||||
return int(subprocess.check_output(["getconf", "PATH_MAX", "/"]))
|
||||
except (ValueError, subprocess.CalledProcessError, OSError):
|
||||
if platform.system() == "Windows":
|
||||
return FileNameFormatter.WINDOWS_MAX_PATH_LENGTH
|
||||
else:
|
||||
return FileNameFormatter.LINUX_MAX_PATH_LENGTH
|
||||
|
||||
def format_resource_paths(
|
||||
self,
|
||||
resources: list[Resource],
|
||||
destination_directory: Path,
|
||||
) -> list[tuple[Path, Resource]]:
|
||||
out = []
|
||||
if len(resources) == 1:
|
||||
try:
|
||||
out.append((self.format_path(resources[0], destination_directory, None), resources[0]))
|
||||
except BulkDownloaderException as e:
|
||||
logger.error(f"Could not generate file path for resource {resources[0].url}: {e}")
|
||||
logger.exception("Could not generate file path")
|
||||
else:
|
||||
for i, res in enumerate(resources, start=1):
|
||||
logger.log(9, f"Formatting filename with index {i}")
|
||||
try:
|
||||
out.append((self.format_path(res, destination_directory, i), res))
|
||||
except BulkDownloaderException as e:
|
||||
logger.error(f"Could not generate file path for resource {res.url}: {e}")
|
||||
logger.exception("Could not generate file path")
|
||||
return out
|
||||
|
||||
@staticmethod
def validate_string(test_string: str) -> bool:
    """Check that a naming format string contains at least one known key term.

    Args:
        test_string: the user-supplied filename/folder format scheme.

    Returns:
        True when the string references at least one key term, else False.
        Also warns when {POSTID} is absent, since the other keys do not
        guarantee unique filenames.
    """
    if not test_string:
        return False
    # Key terms are stored lowercase, so compare case-insensitively
    if not any(f"{{{key}}}" in test_string.lower() for key in FileNameFormatter.key_terms):
        return False
    if "POSTID" not in test_string:
        # Fixed duplicated word ("be be") in the original warning text
        logger.warning(
            "Some files might not be downloaded due to name conflicts as filenames are"
            " not guaranteed to be unique without {POSTID}"
        )
    return True
|
||||
|
||||
@staticmethod
def _format_for_windows(input_string: str) -> str:
    """Remove characters illegal in Windows filenames, then strip emojis."""
    # Single-pass removal of the Windows-reserved characters
    removal_table = str.maketrans("", "", '<>:"\\/|?*')
    cleaned = input_string.translate(removal_table)
    return FileNameFormatter._strip_emojis(cleaned)
|
||||
|
||||
@staticmethod
|
||||
def _strip_emojis(input_string: str) -> str:
|
||||
result = input_string.encode("ascii", errors="ignore").decode("utf-8")
|
||||
return result
|
108
bdfr/oauth2.py
Normal file
108
bdfr/oauth2.py
Normal file
|
@ -0,0 +1,108 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import configparser
|
||||
import logging
|
||||
import random
|
||||
import re
|
||||
import socket
|
||||
from pathlib import Path
|
||||
|
||||
import praw
|
||||
import requests
|
||||
|
||||
from bdfr.exceptions import BulkDownloaderException, RedditAuthenticationError
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class OAuth2Authenticator:
    """Performs the interactive Reddit OAuth2 flow to obtain a refresh token."""

    def __init__(self, wanted_scopes: set[str], client_id: str, client_secret: str):
        # Validates scopes against Reddit's published list before storing them
        self._check_scopes(wanted_scopes)
        self.scopes = wanted_scopes
        self.client_id = client_id
        self.client_secret = client_secret

    @staticmethod
    def _check_scopes(wanted_scopes: set[str]):
        """Verify every requested scope exists in Reddit's scopes.json.

        Raises:
            BulkDownloaderException: if any scope is not recognised by Reddit.
        """
        response = requests.get(
            "https://www.reddit.com/api/v1/scopes.json", headers={"User-Agent": "fetch-scopes test"}
        )
        known_scopes = [scope for scope, data in response.json().items()]
        # "*" is a wildcard scope accepted by Reddit but absent from scopes.json
        known_scopes.append("*")
        for scope in wanted_scopes:
            if scope not in known_scopes:
                raise BulkDownloaderException(f"Scope {scope} is not known to reddit")

    @staticmethod
    def split_scopes(scopes: str) -> set[str]:
        """Split a comma/colon/space-separated scope string into a set."""
        scopes = re.split(r"[,: ]+", scopes)
        return set(scopes)

    def retrieve_new_token(self) -> str:
        """Run the browser-based OAuth2 authorisation flow.

        Directs the user to Reddit's auth URL, receives the redirect on a
        local socket, validates the state parameter, and exchanges the code
        for a refresh token.

        Raises:
            RedditAuthenticationError: on state mismatch or an error response.
        """
        reddit = praw.Reddit(
            redirect_uri="http://localhost:7634",
            user_agent="obtain_refresh_token for BDFR",
            client_id=self.client_id,
            client_secret=self.client_secret,
        )
        # Random state guards against CSRF on the redirect
        state = str(random.randint(0, 65000))
        url = reddit.auth.url(self.scopes, state, "permanent")
        logger.warning("Authentication action required before the program can proceed")
        logger.warning(f"Authenticate at {url}")

        client = self.receive_connection()
        # Parse the query string out of the raw HTTP request line
        data = client.recv(1024).decode("utf-8")
        param_tokens = data.split(" ", 2)[1].split("?", 1)[1].split("&")
        params = {key: value for (key, value) in [token.split("=") for token in param_tokens]}

        if state != params["state"]:
            self.send_message(client)
            raise RedditAuthenticationError(f'State mismatch in OAuth2. Expected: {state} Received: {params["state"]}')
        elif "error" in params:
            self.send_message(client)
            raise RedditAuthenticationError(f'Error in OAuth2: {params["error"]}')

        self.send_message(client, "<script>alert('You can go back to terminal window now.')</script>")
        refresh_token = reddit.auth.authorize(params["code"])
        return refresh_token

    @staticmethod
    def receive_connection() -> socket.socket:
        """Accept exactly one connection on port 7634 and return the client socket.

        The listening socket is closed after the first accept; the caller
        is responsible for closing the returned client socket.
        """
        server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        # SO_REUSEADDR avoids "address already in use" on quick restarts
        server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        server.bind(("0.0.0.0", 7634))
        logger.log(9, "Server listening on 0.0.0.0:7634")

        server.listen(1)
        client = server.accept()[0]
        server.close()
        logger.log(9, "Server closed")

        return client

    @staticmethod
    def send_message(client: socket.socket, message: str = ""):
        """Send a minimal HTTP 200 response with *message* as body, then close."""
        client.send(f"HTTP/1.1 200 OK\r\n\r\n{message}".encode("utf-8"))
        client.close()
|
||||
|
||||
|
||||
class OAuth2TokenManager(praw.reddit.BaseTokenManager):
    """Persists the praw OAuth2 refresh token in the BDFR configuration file."""

    def __init__(self, config: configparser.ConfigParser, config_location: Path):
        super().__init__()
        self.config = config
        self.config_location = config_location

    def pre_refresh_callback(self, authorizer: praw.reddit.Authorizer):
        """Load the stored token into the authorizer when it has none yet.

        Raises:
            RedditAuthenticationError: if no token is stored in the config.
        """
        if authorizer.refresh_token is not None:
            return
        if not self.config.has_option("DEFAULT", "user_token"):
            raise RedditAuthenticationError("No auth token loaded in configuration")
        authorizer.refresh_token = self.config.get("DEFAULT", "user_token")
        logger.log(9, "Loaded OAuth2 token for authoriser")

    def post_refresh_callback(self, authorizer: praw.reddit.Authorizer):
        """Write the (possibly rotated) refresh token back to the config file."""
        self.config.set("DEFAULT", "user_token", authorizer.refresh_token)
        with Path(self.config_location).open(mode="w") as file:
            self.config.write(file, True)
        logger.log(9, f"Written OAuth2 token from authoriser to {self.config_location}")
|
87
bdfr/resource.py
Normal file
87
bdfr/resource.py
Normal file
|
@ -0,0 +1,87 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
import urllib.parse
|
||||
from collections.abc import Callable
|
||||
from typing import Optional
|
||||
|
||||
import _hashlib
|
||||
import requests
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.exceptions import BulkDownloaderException
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Resource:
    """A single downloadable item (image, video, text) tied to a submission."""

    def __init__(self, source_submission: Submission, url: str, download_function: Callable, extension: str = None):
        self.source_submission = source_submission
        # Raw bytes of the downloaded file; populated by download()
        self.content: Optional[bytes] = None
        self.url = url
        # MD5 hash of content; used elsewhere for duplicate detection
        self.hash: Optional[_hashlib.HASH] = None
        self.extension = extension
        # Callable taking a dict of download parameters and returning bytes
        self.download_function = download_function
        if not self.extension:
            # Fall back to guessing the extension from the URL path
            self.extension = self._determine_extension()

    @staticmethod
    def retry_download(url: str) -> Callable:
        """Return a download callable bound to *url* that uses http_download."""
        return lambda global_params: Resource.http_download(url, global_params)

    def download(self, download_parameters: Optional[dict] = None):
        """Download the content if not already present, then compute its hash.

        Raises:
            BulkDownloaderException: if the download ultimately fails.
        """
        if download_parameters is None:
            download_parameters = {}
        if not self.content:
            try:
                content = self.download_function(download_parameters)
            except requests.exceptions.ConnectionError as e:
                raise BulkDownloaderException(f"Could not download resource: {e}")
            except BulkDownloaderException:
                # Already a project-level error; propagate unchanged
                raise
            if content:
                self.content = content
        if not self.hash and self.content:
            self.create_hash()

    def create_hash(self):
        """Compute and store the MD5 hash of the downloaded content."""
        self.hash = hashlib.md5(self.content)

    def _determine_extension(self) -> Optional[str]:
        """Guess a file extension (including the dot) from the URL path.

        Returns None when the path has no 3-5 character extension.
        """
        extension_pattern = re.compile(r".*(\..{3,5})$")
        # Only the path component, so query strings don't confuse the match
        stripped_url = urllib.parse.urlsplit(self.url).path
        match = re.search(extension_pattern, stripped_url)
        if match:
            return match.group(1)

    @staticmethod
    def http_download(url: str, download_parameters: dict) -> Optional[bytes]:
        """Fetch *url* over HTTP, retrying transient failures with backoff.

        The wait grows by 60 seconds after each retryable failure, up to
        download_parameters["max_wait_time"] (default 300s); after that the
        last error is re-raised.

        Raises:
            BulkDownloaderException: for non-retryable HTTP status codes.
        """
        headers = download_parameters.get("headers")
        current_wait_time = 60
        if "max_wait_time" in download_parameters:
            max_wait_time = download_parameters["max_wait_time"]
        else:
            max_wait_time = 300
        while True:
            try:
                response = requests.get(url, headers=headers)
                if re.match(r"^2\d{2}", str(response.status_code)) and response.content:
                    return response.content
                elif response.status_code in (408, 429):
                    # Request timeout / rate limit: treat as retryable
                    raise requests.exceptions.ConnectionError(f"Response code {response.status_code}")
                else:
                    raise BulkDownloaderException(
                        f"Unrecoverable error requesting resource: HTTP Code {response.status_code}"
                    )
            except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError) as e:
                logger.warning(f"Error occured downloading from {url}, waiting {current_wait_time} seconds: {e}")
                time.sleep(current_wait_time)
                if current_wait_time < max_wait_time:
                    current_wait_time += 60
                else:
                    logger.error(f"Max wait time exceeded for resource at url {url}")
                    raise
|
9
bdfr/site_authenticator.py
Normal file
9
bdfr/site_authenticator.py
Normal file
|
@ -0,0 +1,9 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import configparser
|
||||
|
||||
|
||||
class SiteAuthenticator:
    """Holds site-specific authentication state shared with downloaders."""

    def __init__(self, cfg: configparser.ConfigParser):
        # Populated later if/when Imgur authentication is actually needed
        self.imgur_authentication = None
|
2
bdfr/site_downloaders/__init__.py
Normal file
2
bdfr/site_downloaders/__init__.py
Normal file
|
@ -0,0 +1,2 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
37
bdfr/site_downloaders/base_downloader.py
Normal file
37
bdfr/site_downloaders/base_downloader.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.exceptions import ResourceNotFound, SiteDownloaderError
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BaseDownloader(ABC):
    """Abstract base class for all site-specific downloader modules."""

    def __init__(self, post: Submission, typical_extension: Optional[str] = None):
        self.post = post
        self.typical_extension = typical_extension

    @abstractmethod
    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """Return list of all un-downloaded Resources from submission"""
        raise NotImplementedError

    @staticmethod
    def retrieve_url(url: str, cookies: dict = None, headers: dict = None) -> requests.Response:
        """Fetch a URL, translating failures into project-level errors.

        Raises:
            SiteDownloaderError: if the request itself fails.
            ResourceNotFound: if the server responds with a non-200 status.
        """
        try:
            res = requests.get(url, cookies=cookies, headers=headers)
        except requests.exceptions.RequestException as e:
            logger.exception(e)
            # Chain the original exception so tracebacks show the root cause
            raise SiteDownloaderError(f"Failed to get page {url}") from e
        if res.status_code != 200:
            raise ResourceNotFound(f"Server responded with {res.status_code} to {url}")
        return res
|
22
bdfr/site_downloaders/delay_for_reddit.py
Normal file
22
bdfr/site_downloaders/delay_for_reddit.py
Normal file
|
@ -0,0 +1,22 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
from bdfr.site_downloaders.base_downloader import BaseDownloader
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DelayForReddit(BaseDownloader):
    """Downloader for media hosted via delayforreddit.com."""

    def __init__(self, post: Submission):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """Follow the post URL and wrap the resolved media location in a Resource."""
        media_page = DelayForReddit.retrieve_url(self.post.url)
        resource = Resource(self.post, media_page.url, Resource.retry_download(media_page.url))
        return [resource]
|
18
bdfr/site_downloaders/direct.py
Normal file
18
bdfr/site_downloaders/direct.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
from bdfr.site_downloaders.base_downloader import BaseDownloader
|
||||
|
||||
|
||||
class Direct(BaseDownloader):
    """Downloader for URLs that point directly at a media file."""

    def __init__(self, post: Submission):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """The post URL is itself the media; no scraping required."""
        target = self.post.url
        return [Resource(self.post, target, Resource.retry_download(target))]
|
89
bdfr/site_downloaders/download_factory.py
Normal file
89
bdfr/site_downloaders/download_factory.py
Normal file
|
@ -0,0 +1,89 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import re
|
||||
import urllib.parse
|
||||
|
||||
from bdfr.exceptions import NotADownloadableLinkError
|
||||
from bdfr.site_downloaders.base_downloader import BaseDownloader
|
||||
from bdfr.site_downloaders.delay_for_reddit import DelayForReddit
|
||||
from bdfr.site_downloaders.direct import Direct
|
||||
from bdfr.site_downloaders.erome import Erome
|
||||
from bdfr.site_downloaders.fallback_downloaders.ytdlp_fallback import YtdlpFallback
|
||||
from bdfr.site_downloaders.gallery import Gallery
|
||||
from bdfr.site_downloaders.gfycat import Gfycat
|
||||
from bdfr.site_downloaders.imgur import Imgur
|
||||
from bdfr.site_downloaders.pornhub import PornHub
|
||||
from bdfr.site_downloaders.redgifs import Redgifs
|
||||
from bdfr.site_downloaders.self_post import SelfPost
|
||||
from bdfr.site_downloaders.vidble import Vidble
|
||||
from bdfr.site_downloaders.vreddit import VReddit
|
||||
from bdfr.site_downloaders.youtube import Youtube
|
||||
|
||||
|
||||
class DownloadFactory:
    """Maps a submission URL to the downloader class that can handle it."""

    @staticmethod
    def pull_lever(url: str) -> type[BaseDownloader]:
        """Select the downloader class for *url*.

        NOTE: match order matters — more specific host patterns must be
        tested before generic ones (e.g. the direct-file check before the
        ytdlp fallback).

        Raises:
            NotADownloadableLinkError: when no downloader matches.
        """
        sanitised_url = DownloadFactory.sanitise_url(url).lower()
        if re.match(r"(i\.|m\.|o\.)?imgur", sanitised_url):
            return Imgur
        elif re.match(r"(i\.|thumbs\d\.|v\d\.)?(redgifs|gifdeliverynetwork)", sanitised_url):
            return Redgifs
        elif re.match(r"(thumbs\.|giant\.)?gfycat\.", sanitised_url):
            return Gfycat
        # Any URL ending in a 3-4 character file extension (the "34" covers
        # mp3/mp4-style digits), unless it is a web page resource like .html
        elif re.match(r".*/.*\.[a-zA-Z34]{3,4}(\?[\w;&=]*)?$", sanitised_url) and not DownloadFactory.is_web_resource(
            sanitised_url
        ):
            return Direct
        elif re.match(r"erome\.com.*", sanitised_url):
            return Erome
        elif re.match(r"delayforreddit\.com", sanitised_url):
            return DelayForReddit
        elif re.match(r"reddit\.com/gallery/.*", sanitised_url):
            return Gallery
        elif re.match(r"patreon\.com.*", sanitised_url):
            return Gallery
        elif re.match(r"reddit\.com/r/", sanitised_url):
            return SelfPost
        elif re.match(r"(m\.)?youtu\.?be", sanitised_url):
            return Youtube
        elif re.match(r"i\.redd\.it.*", sanitised_url):
            return Direct
        elif re.match(r"v\.redd\.it.*", sanitised_url):
            return VReddit
        elif re.match(r"pornhub\.com.*", sanitised_url):
            return PornHub
        elif re.match(r"vidble\.com", sanitised_url):
            return Vidble
        # Last resort: anything yt-dlp claims it can extract
        elif YtdlpFallback.can_handle_link(sanitised_url):
            return YtdlpFallback
        else:
            raise NotADownloadableLinkError(f"No downloader module exists for url {url}")

    @staticmethod
    def sanitise_url(url: str) -> str:
        """Strip scheme, leading whitespace, and 'www.' prefix from a URL."""
        beginning_regex = re.compile(r"\s*(www\.?)?")
        split_url = urllib.parse.urlsplit(url)
        # Keep only host + path; drop scheme, query, and fragment
        split_url = split_url.netloc + split_url.path
        split_url = re.sub(beginning_regex, "", split_url)
        return split_url

    @staticmethod
    def is_web_resource(url: str) -> bool:
        """Return True when the URL ends in a web-page extension (.html etc.)."""
        web_extensions = (
            "asp",
            "aspx",
            "cfm",
            "cfml",
            "css",
            "htm",
            "html",
            "js",
            "php",
            "php3",
            "xhtml",
        )
        # Case-insensitive match against any of the known page extensions
        if re.match(rf'(?i).*/.*\.({"|".join(web_extensions)})$', url):
            return True
        else:
            return False
|
58
bdfr/site_downloaders/erome.py
Normal file
58
bdfr/site_downloaders/erome.py
Normal file
|
@ -0,0 +1,58 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import logging
|
||||
import re
|
||||
from collections.abc import Callable
|
||||
from typing import Optional
|
||||
|
||||
import bs4
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.exceptions import SiteDownloaderError
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
from bdfr.site_downloaders.base_downloader import BaseDownloader
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Erome(BaseDownloader):
    """Downloader for erome.com album pages."""

    def __init__(self, post: Submission):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """Scrape the album page and build one Resource per media link.

        Raises:
            SiteDownloaderError: when the page yields no links at all.
        """
        found_links = self._get_links(self.post.url)
        if not found_links:
            raise SiteDownloaderError("Erome parser could not find any links")

        resources = []
        for raw_link in found_links:
            # Page markup sometimes omits the scheme; normalise to https
            full_link = raw_link if re.match(r"https?://.*", raw_link) else "https://" + raw_link
            resources.append(Resource(self.post, full_link, self.erome_download(full_link)))
        return resources

    @staticmethod
    def _get_links(url: str) -> set[str]:
        """Collect image and video source URLs from an Erome album page."""
        page = Erome.retrieve_url(url)
        soup = bs4.BeautifulSoup(page.text, "html.parser")
        # "lasyload" is the (misspelled) CSS class Erome's markup actually uses
        image_tags = soup.find_all("img", attrs={"class": "lasyload"})
        video_tags = soup.find_all("source")
        candidates = [tag.get("data-src") for tag in image_tags]
        candidates.extend(tag.get("src") for tag in video_tags)
        return set(candidates)

    @staticmethod
    def erome_download(url: str) -> Callable:
        """Build a download callable that sends browser-like headers."""
        download_parameters = {
            "headers": {
                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
                " Chrome/88.0.4324.104 Safari/537.36",
                "Referer": "https://www.erome.com/",
            },
        }
        return lambda global_params: Resource.http_download(url, global_params | download_parameters)
|
2
bdfr/site_downloaders/fallback_downloaders/__init__.py
Normal file
2
bdfr/site_downloaders/fallback_downloaders/__init__.py
Normal file
|
@ -0,0 +1,2 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
|
@ -0,0 +1,14 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from bdfr.site_downloaders.base_downloader import BaseDownloader
|
||||
|
||||
|
||||
class BaseFallbackDownloader(BaseDownloader, ABC):
    """Interface for downloaders tried as a last resort for unmatched URLs."""

    @staticmethod
    @abstractmethod
    def can_handle_link(url: str) -> bool:
        """Returns whether the fallback downloader can download this link"""
        raise NotImplementedError
|
38
bdfr/site_downloaders/fallback_downloaders/ytdlp_fallback.py
Normal file
38
bdfr/site_downloaders/fallback_downloaders/ytdlp_fallback.py
Normal file
|
@ -0,0 +1,38 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.exceptions import NotADownloadableLinkError
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
from bdfr.site_downloaders.fallback_downloaders.fallback_downloader import BaseFallbackDownloader
|
||||
from bdfr.site_downloaders.youtube import Youtube
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class YtdlpFallback(BaseFallbackDownloader, Youtube):
    """Fallback that hands any yt-dlp-supported URL to the Youtube downloader."""

    def __init__(self, post: Submission):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """Create a single Resource using yt-dlp's reported extension."""
        out = Resource(
            self.post,
            self.post.url,
            super()._download_video({}),
            super().get_video_attributes(self.post.url)["ext"],
        )
        return [out]

    @staticmethod
    def can_handle_link(url: str) -> bool:
        """Return whether yt-dlp can extract video attributes for *url*.

        Fix: the original implicitly returned None (not False) when the
        extracted attributes were falsy, despite the bool annotation.
        """
        try:
            attributes = YtdlpFallback.get_video_attributes(url)
        except NotADownloadableLinkError:
            return False
        return bool(attributes)
|
49
bdfr/site_downloaders/gallery.py
Normal file
49
bdfr/site_downloaders/gallery.py
Normal file
|
@ -0,0 +1,49 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.exceptions import SiteDownloaderError
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
from bdfr.site_downloaders.base_downloader import BaseDownloader
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Gallery(BaseDownloader):
    """Downloader for Reddit native galleries."""

    def __init__(self, post: Submission):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """Find gallery image URLs on the post or on its crosspost parent.

        Raises:
            SiteDownloaderError: when no gallery images can be located.
        """
        try:
            image_urls = self._get_links(self.post.gallery_data["items"])
        except (AttributeError, TypeError):
            # Crossposts keep the gallery data on the original submission
            try:
                parent_items = self.post.crosspost_parent_list[0]["gallery_data"]["items"]
                image_urls = self._get_links(parent_items)
            except (AttributeError, IndexError, TypeError, KeyError):
                logger.error(f"Could not find gallery data in submission {self.post.id}")
                logger.exception("Gallery image find failure")
                raise SiteDownloaderError("No images found in Reddit gallery")

        if not image_urls:
            raise SiteDownloaderError("No images found in Reddit gallery")
        return [Resource(self.post, url, Resource.retry_download(url)) for url in image_urls]

    @staticmethod
    def _get_links(id_dict: list[dict]) -> list[str]:
        """Probe i.redd.it with each known extension until one responds 200."""
        found = []
        extensions = (".jpg", ".png", ".gif", ".gifv", ".jpeg")
        for item in id_dict:
            media_id = item["media_id"]
            for ext in extensions:
                candidate = f"https://i.redd.it/{media_id}{ext}"
                if requests.head(candidate).status_code == 200:
                    found.append(candidate)
                    break
        return found
|
45
bdfr/site_downloaders/gfycat.py
Normal file
45
bdfr/site_downloaders/gfycat.py
Normal file
|
@ -0,0 +1,45 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import json
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.exceptions import SiteDownloaderError
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
from bdfr.site_downloaders.redgifs import Redgifs
|
||||
|
||||
|
||||
class Gfycat(Redgifs):
    """Downloader for gfycat.com links; defers to Redgifs for migrated gifs."""

    def __init__(self, post: Submission):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        return super().find_resources(authenticator)

    @staticmethod
    def _get_link(url: str) -> set[str]:
        """Resolve a gfycat URL to its direct video content URL(s).

        Raises:
            SiteDownloaderError: if the page JSON is missing or malformed.
        """
        # Fix: the extension quantifier was written "{3-4}", which regex treats
        # as literal characters and so never matched; IDs extracted from URLs
        # like ".../name.gif" wrongly kept the ".gif" suffix. "{3,4}" is correct.
        gfycat_id = re.match(r".*/(.*?)(?:/?|-.*|\..{3,4})$", url).group(1)
        url = "https://gfycat.com/" + gfycat_id

        response = Gfycat.retrieve_url(url)
        if re.search(r"(redgifs|gifdeliverynetwork)", response.url):
            url = url.lower()  # Fixes error with old gfycat/redgifs links
            return Redgifs._get_link(url)

        soup = BeautifulSoup(response.text, "html.parser")
        content = soup.find("script", attrs={"data-react-helmet": "true", "type": "application/ld+json"})

        try:
            out = json.loads(content.contents[0])["video"]["contentUrl"]
        except (IndexError, KeyError, AttributeError) as e:
            raise SiteDownloaderError(f"Failed to download Gfycat link {url}: {e}")
        except json.JSONDecodeError as e:
            raise SiteDownloaderError(f"Did not receive valid JSON data: {e}")
        return {
            out,
        }
|
65
bdfr/site_downloaders/imgur.py
Normal file
65
bdfr/site_downloaders/imgur.py
Normal file
|
@ -0,0 +1,65 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import json
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.exceptions import SiteDownloaderError
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
from bdfr.site_downloaders.base_downloader import BaseDownloader
|
||||
|
||||
|
||||
class Imgur(BaseDownloader):
    """Downloader for Imgur images and albums via the public Imgur API."""

    def __init__(self, post: Submission):
        super().__init__(post)
        # Parsed API payload for this post; filled in by find_resources()
        self.raw_data = {}

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """Build a Resource per image, preferring the mp4 rendition when present."""
        self.raw_data = self._get_data(self.post.url)

        out = []
        if "is_album" in self.raw_data:
            for image in self.raw_data["images"]:
                if "mp4" in image:
                    out.append(Resource(self.post, image["mp4"], Resource.retry_download(image["mp4"])))
                else:
                    out.append(Resource(self.post, image["link"], Resource.retry_download(image["link"])))
        else:
            if "mp4" in self.raw_data:
                out.append(Resource(self.post, self.raw_data["mp4"], Resource.retry_download(self.raw_data["mp4"])))
            else:
                out.append(Resource(self.post, self.raw_data["link"], Resource.retry_download(self.raw_data["link"])))
        return out

    @staticmethod
    def _get_data(link: str) -> dict:
        """Extract the Imgur ID from a link and fetch its API metadata.

        Albums/galleries are routed to the album endpoint, single images to
        the image endpoint.

        Raises:
            SiteDownloaderError: if the ID cannot be extracted or the API
                response is not valid JSON.
        """
        try:
            if link.endswith("/"):
                link = link.removesuffix("/")
            if re.search(r".*/(.*?)(gallery/|a/)", link):
                imgur_id = re.match(r".*/(?:gallery/|a/)(.*?)(?:/.*)?$", link).group(1)
                link = f"https://api.imgur.com/3/album/{imgur_id}"
            else:
                # Strips an optional "_d" suffix and any file extension
                imgur_id = re.match(r".*/(.*?)(?:_d)?(?:\..{0,})?$", link).group(1)
                link = f"https://api.imgur.com/3/image/{imgur_id}"
        except AttributeError:
            # re.match returned None, so .group(1) raised
            raise SiteDownloaderError(f"Could not extract Imgur ID from {link}")

        headers = {
            "referer": "https://imgur.com/",
            "origin": "https://imgur.com",
            "content-type": "application/json",
            "Authorization": "Client-ID 546c25a59c58ad7",
        }
        res = Imgur.retrieve_url(link, headers=headers)

        try:
            image_dict = json.loads(res.text)
        except json.JSONDecodeError as e:
            raise SiteDownloaderError(f"Could not parse received response as JSON: {e}")

        return image_dict["data"]
|
37
bdfr/site_downloaders/pornhub.py
Normal file
37
bdfr/site_downloaders/pornhub.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.exceptions import SiteDownloaderError
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
from bdfr.site_downloaders.youtube import Youtube
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PornHub(Youtube):
    """Downloader for pornhub.com videos, delegating to the yt-dlp-based Youtube class."""

    def __init__(self, post: Submission):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """Download the best-quality video and use its reported extension.

        Raises:
            SiteDownloaderError: when no video attributes can be retrieved.
        """
        ytdl_options = {
            "format": "best",
            "nooverwrites": True,
        }
        video_attributes = super().get_video_attributes(self.post.url)
        if not video_attributes:
            raise SiteDownloaderError()
        extension = video_attributes["ext"]

        resource = Resource(
            self.post,
            self.post.url,
            super()._download_video(ytdl_options),
            extension,
        )
        return [resource]
|
86
bdfr/site_downloaders/redgifs.py
Normal file
86
bdfr/site_downloaders/redgifs.py
Normal file
|
@ -0,0 +1,86 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import json
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.exceptions import SiteDownloaderError
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
from bdfr.site_downloaders.base_downloader import BaseDownloader
|
||||
|
||||
|
||||
class Redgifs(BaseDownloader):
    """Downloader for redgifs.com videos, images, and image galleries."""

    def __init__(self, post: Submission):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """Resolve the post URL via the Redgifs API and wrap each media URL."""
        media_urls = self._get_link(self.post.url)
        return [Resource(self.post, m, Resource.retry_download(m), None) for m in media_urls]

    @staticmethod
    def _get_id(url: str) -> str:
        """Extract the lowercase Redgifs content ID from a URL.

        Raises:
            SiteDownloaderError: if no ID can be extracted.
        """
        try:
            if url.endswith("/"):
                url = url.removesuffix("/")
            # Last path segment, minus any fragment, query string, or extension
            redgif_id = re.match(r".*/(.*?)(?:#.*|\?.*|\..{0,})?$", url).group(1).lower()
            if redgif_id.endswith("-mobile"):
                # Mobile-rendition URLs carry a suffix not present in the API ID
                redgif_id = redgif_id.removesuffix("-mobile")
        except AttributeError:
            raise SiteDownloaderError(f"Could not extract Redgifs ID from {url}")
        return redgif_id

    @staticmethod
    def _get_link(url: str) -> set[str]:
        """Query the Redgifs v2 API for the direct media URL(s) of *url*.

        Videos prefer the HD rendition, falling back to SD when HD is not
        retrievable; image galleries expand to every item's HD URL.

        Raises:
            SiteDownloaderError: on auth failure, bad JSON, or missing data.
        """
        redgif_id = Redgifs._get_id(url)

        # The v2 API requires a short-lived anonymous bearer token
        auth_token = json.loads(Redgifs.retrieve_url("https://api.redgifs.com/v2/auth/temporary").text)["token"]
        if not auth_token:
            raise SiteDownloaderError("Unable to retrieve Redgifs API token")

        headers = {
            "referer": "https://www.redgifs.com/",
            "origin": "https://www.redgifs.com",
            "content-type": "application/json",
            "Authorization": f"Bearer {auth_token}",
        }

        content = Redgifs.retrieve_url(f"https://api.redgifs.com/v2/gifs/{redgif_id}", headers=headers)

        if content is None:
            raise SiteDownloaderError("Could not read the page source")

        try:
            response_json = json.loads(content.text)
        except json.JSONDecodeError as e:
            raise SiteDownloaderError(f"Received data was not valid JSON: {e}")

        out = set()
        try:
            if response_json["gif"]["type"] == 1:  # type 1 is a video
                if requests.get(response_json["gif"]["urls"]["hd"], headers=headers).ok:
                    out.add(response_json["gif"]["urls"]["hd"])
                else:
                    out.add(response_json["gif"]["urls"]["sd"])
            elif response_json["gif"]["type"] == 2:  # type 2 is an image
                if response_json["gif"]["gallery"]:
                    content = Redgifs.retrieve_url(
                        f'https://api.redgifs.com/v2/gallery/{response_json["gif"]["gallery"]}'
                    )
                    response_json = json.loads(content.text)
                    out = {p["urls"]["hd"] for p in response_json["gifs"]}
                else:
                    out.add(response_json["gif"]["urls"]["hd"])
            else:
                # Unknown media type; handled by the except below
                raise KeyError
        except (KeyError, AttributeError):
            raise SiteDownloaderError("Failed to find JSON data in page")

        # Update subdomain if old one is returned
        out = {re.sub("thumbs2", "thumbs3", link) for link in out}
        out = {re.sub("thumbs3", "thumbs4", link) for link in out}
        return out
|
46
bdfr/site_downloaders/self_post.py
Normal file
46
bdfr/site_downloaders/self_post.py
Normal file
|
@ -0,0 +1,46 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
from bdfr.site_downloaders.base_downloader import BaseDownloader
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SelfPost(BaseDownloader):
    """Downloader that renders a Reddit self post as a markdown text file."""

    def __init__(self, post: Submission):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """Render the post body to UTF-8 text and package it as a .txt Resource."""
        resource = Resource(self.post, self.post.url, lambda: None, ".txt")
        resource.content = self.export_to_string().encode("utf-8")
        resource.create_hash()
        return [resource]

    def export_to_string(self) -> str:
        """Self posts are formatted here"""
        # Deleted accounts have no author object
        author = self.post.author.name if self.post.author else "DELETED"
        subreddit = self.post.subreddit.title
        return (
            f"## [{self.post.fullname}]({self.post.url})\n"
            f"{self.post.selftext}\n\n---\n\n"
            f"submitted to [r/{subreddit}](https://www.reddit.com/r/{subreddit})"
            f" by [u/{author}](https://www.reddit.com/user/{author})"
        )
|
55
bdfr/site_downloaders/vidble.py
Normal file
55
bdfr/site_downloaders/vidble.py
Normal file
|
@ -0,0 +1,55 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import itertools
|
||||
import logging
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
import bs4
|
||||
import requests
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.exceptions import SiteDownloaderError
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
from bdfr.site_downloaders.base_downloader import BaseDownloader
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Vidble(BaseDownloader):
    """Downloader for vidble.com image and video pages."""

    def __init__(self, post: Submission):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """Scrape the post URL for media links and wrap each in a Resource.

        Raises:
            SiteDownloaderError: if the page cannot be parsed or holds no media.
        """
        try:
            res = self.get_links(self.post.url)
        except AttributeError:
            # bs4 returns None for a missing content div; attribute access on it fails inside get_links
            raise SiteDownloaderError(f"Could not read page at {self.post.url}")
        if not res:
            raise SiteDownloaderError(f"No resources found at {self.post.url}")
        res = [Resource(self.post, r, Resource.retry_download(r)) for r in res]
        return res

    @staticmethod
    def get_links(url: str) -> set[str]:
        """Extract all full-size image/video source URLs from a vidble page."""
        # Bare links such as vidble.com/abcdef need the /show/ prefix to resolve to a page
        # (dots escaped so the pattern matches the literal domain only)
        if not re.search(r"vidble\.com/(show/|album/|watch\?v)", url):
            url = re.sub(r"/(\w*?)$", r"/show/\1", url)

        # timeout prevents an unresponsive server from hanging the whole run
        page = requests.get(url, timeout=30)
        soup = bs4.BeautifulSoup(page.text, "html.parser")
        content_div = soup.find("div", attrs={"id": "ContentPlaceHolder1_divContent"})
        images = [i.get("src") for i in content_div.find_all("img")]
        videos = [v.get("src") for v in content_div.find_all("source", attrs={"type": "video/mp4"})]
        resources = filter(None, itertools.chain(images, videos))
        resources = ["https://www.vidble.com" + r for r in resources]
        return {Vidble.change_med_url(r) for r in resources}

    @staticmethod
    def change_med_url(url: str) -> str:
        """Strip the '_med' thumbnail suffix so the full-size media URL is used."""
        return re.sub(r"_med(\..{3,4})$", r"\1", url)
|
42
bdfr/site_downloaders/vreddit.py
Normal file
42
bdfr/site_downloaders/vreddit.py
Normal file
|
@ -0,0 +1,42 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.exceptions import NotADownloadableLinkError
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
from bdfr.site_downloaders.youtube import Youtube
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class VReddit(Youtube):
    """Downloader for v.redd.it videos; reuses the Youtube yt-dlp machinery."""

    def __init__(self, post: Submission):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """Return a single video Resource for the post URL."""
        ytdl_options = {
            "playlistend": 1,
            "nooverwrites": True,
        }
        download_function = self._download_video(ytdl_options)
        # Probe the URL first so the Resource carries the right file extension
        extension = self.get_video_attributes(self.post.url)["ext"]
        res = Resource(self.post, self.post.url, download_function, extension)
        return [res]

    @staticmethod
    def get_video_attributes(url: str) -> dict:
        """Return yt-dlp info for *url*, unwrapping single-entry playlist results.

        Raises:
            NotADownloadableLinkError: if no usable video entry can be found.
        """
        result = VReddit.get_video_data(url)
        if "ext" in result:
            return result
        # Playlist-style results nest the real video info under "entries";
        # catch only the lookup failures instead of a blanket Exception
        try:
            return result["entries"][0]
        except (KeyError, IndexError, TypeError) as e:
            logger.exception(e)
            raise NotADownloadableLinkError(f"Video info extraction failed for {url}") from e
|
85
bdfr/site_downloaders/youtube.py
Normal file
85
bdfr/site_downloaders/youtube.py
Normal file
|
@ -0,0 +1,85 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import logging
|
||||
import tempfile
|
||||
from collections.abc import Callable
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import yt_dlp
|
||||
from praw.models import Submission
|
||||
|
||||
from bdfr.exceptions import NotADownloadableLinkError, SiteDownloaderError
|
||||
from bdfr.resource import Resource
|
||||
from bdfr.site_authenticator import SiteAuthenticator
|
||||
from bdfr.site_downloaders.base_downloader import BaseDownloader
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Youtube(BaseDownloader):
    """Downloader that delegates to yt-dlp for YouTube (and similar) video links."""

    def __init__(self, post: Submission):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        """Return a single video Resource for the post URL."""
        ytdl_options = {
            "format": "best",
            "playlistend": 1,
            "nooverwrites": True,
        }
        download_function = self._download_video(ytdl_options)
        # Probe the URL first so the Resource carries the right file extension
        extension = self.get_video_attributes(self.post.url)["ext"]
        res = Resource(self.post, self.post.url, download_function, extension)
        return [res]

    def _download_video(self, ytdl_options: dict) -> Callable:
        """Build a deferred download function that fetches the video into memory.

        The returned callable downloads into a temporary directory and reads the
        resulting file back as bytes, so nothing touches the final destination
        until the Resource layer decides to write it.

        Raises (from the returned callable):
            SiteDownloaderError: if yt-dlp fails to download the video.
            NotADownloadableLinkError: if no file was produced.
        """
        yt_logger = logging.getLogger("youtube-dl")
        yt_logger.setLevel(logging.CRITICAL)  # silence yt-dlp's own console output
        ytdl_options["quiet"] = True
        ytdl_options["logger"] = yt_logger

        def download(_: dict) -> bytes:
            with tempfile.TemporaryDirectory() as temp_dir:
                download_path = Path(temp_dir).resolve()
                ytdl_options["outtmpl"] = str(download_path) + "/" + "test.%(ext)s"
                try:
                    with yt_dlp.YoutubeDL(ytdl_options) as ydl:
                        ydl.download([self.post.url])
                except yt_dlp.DownloadError as e:
                    # chain the cause so the original yt-dlp traceback is kept
                    raise SiteDownloaderError(f"Youtube download failed: {e}") from e

                downloaded_files = list(download_path.iterdir())
                if downloaded_files:
                    downloaded_file = downloaded_files[0]
                else:
                    raise NotADownloadableLinkError(f"No media exists in the URL {self.post.url}")
                with downloaded_file.open("rb") as file:
                    content = file.read()
                return content

        return download

    @staticmethod
    def get_video_data(url: str) -> dict:
        """Extract video metadata for *url* via yt-dlp without downloading.

        Raises:
            NotADownloadableLinkError: if yt-dlp cannot extract the info.
        """
        yt_logger = logging.getLogger("youtube-dl")
        yt_logger.setLevel(logging.CRITICAL)
        with yt_dlp.YoutubeDL({"logger": yt_logger}) as ydl:
            try:
                result = ydl.extract_info(url, download=False)
            except Exception as e:
                # yt-dlp raises many exception types; log and normalise to our own,
                # chaining the cause for debuggability
                logger.exception(e)
                raise NotADownloadableLinkError(f"Video info extraction failed for {url}") from e
        return result

    @staticmethod
    def get_video_attributes(url: str) -> dict:
        """Return yt-dlp info for *url*, requiring a file extension to be present."""
        result = Youtube.get_video_data(url)
        if "ext" in result:
            return result
        raise NotADownloadableLinkError(f"Video info extraction failed for {url}")
|
5
devscripts/configure.ps1
Normal file
5
devscripts/configure.ps1
Normal file
|
@ -0,0 +1,5 @@
|
|||
# Build a test configuration when a Reddit OAuth token is supplied via the environment.
if (-not ([string]::IsNullOrEmpty($env:REDDIT_TOKEN)))
{
    # Start from the default config shipped with the package...
    Copy-Item .\\bdfr\\default_config.cfg .\\test_config.cfg
    # ...then append the token so authenticated integration tests can run
    Write-Output "`nuser_token = $env:REDDIT_TOKEN" >> ./test_config.cfg
}
|
7
devscripts/configure.sh
Executable file
7
devscripts/configure.sh
Executable file
|
@ -0,0 +1,7 @@
|
|||
#!/bin/bash

# Build a test configuration when a Reddit OAuth token is supplied via the environment.
if [ -n "$REDDIT_TOKEN" ]; then
    # Start from the default config shipped with the package, then append the
    # token so authenticated integration tests can run.
    cp ./bdfr/default_config.cfg ./test_config.cfg
    printf '\nuser_token = %s\n' "$REDDIT_TOKEN" >> ./test_config.cfg
fi
|
41
docs/ARCHITECTURE.md
Normal file
41
docs/ARCHITECTURE.md
Normal file
|
@ -0,0 +1,41 @@
|
|||
# Architecture
|
||||
|
||||
When the project was rewritten for v2, the goal was to make the codebase easily extensible and much easier to read and modify. However, this document provides a step-by-step look through the process that the BDFR goes through, so that any prospective developers can more easily grasp the way the code works.
|
||||
|
||||
## Design Ethos
|
||||
|
||||
The BDFR is designed to be a stateless downloader. This means that the state of the program is forgotten between each run of the program. There are no central lists, databases, or indices, that the BDFR uses, only the actual files on disk. There are several advantages to this approach:
|
||||
|
||||
1. There is no chance of the database being corrupted or changed by something other than the BDFR, rendering the BDFR's "idea" of the archive wrong or incomplete.
|
||||
2. Any information about the archive is contained by the archive itself i.e. for a list of all submission IDs in the archive, this can be extracted from the names of the files in said archive, assuming an appropriate naming scheme was used.
|
||||
3. Archives can be merged, split, or edited without worrying about having to update a central database
|
||||
4. There are no versioning issues between updates of the BDFR, where old versions are stuck with a worse form of the database
|
||||
5. An archive can be put on a USB, moved to another computer with possibly a very different BDFR version, and work completely fine
|
||||
|
||||
Another major part of the ethos of the design is DOTADIW, Do One Thing And Do It Well. It's a major part of Unix philosophy and states that each tool should have a well-defined, limited purpose. To this end, the BDFR is, as the name implies, a *downloader*. That is the scope of the tool. Managing the files downloaded can be for better-suited programs, since the BDFR is not a file manager. Nor does the BDFR concern itself with how any of the data downloaded is displayed, changed, parsed, or analysed. This makes the BDFR suitable for data science-related tasks, archiving, personal downloads, or analysis of various Reddit sources as the BDFR is completely agnostic on how the data is used.
|
||||
|
||||
## The Download Process
|
||||
|
||||
The BDFR is organised around a central object, the RedditDownloader class. The Archiver object extends and inherits from this class.
|
||||
|
||||
1. The RedditDownloader parses all the arguments and configuration options, held in the Configuration object, and creates a variety of internal objects for use, such as the file name formatter, download filter, etc.
|
||||
2. The RedditDownloader scrapes raw submissions from Reddit via several methods relating to different sources. A source is defined as a single stream of submissions from a subreddit, multireddit, or user list.
|
||||
3. These raw submissions are passed to the DownloaderFactory class to select the specialised downloader class to use. Each of these are for a specific website or link type, with some catch-all classes like Direct.
|
||||
4. The BaseDownloader child, spawned by DownloaderFactory, takes the link and does any necessary processing to find the direct link to the actual resource.
|
||||
5. This is returned to the RedditDownloader in the form of a Resource object. This holds the URL and some other information for the final resource.
|
||||
6. The Resource is passed through the DownloadFilter instantiated in step 1.
|
||||
7. The destination file name for the Resource is calculated. If it already exists, then the Resource will be discarded.
|
||||
8. Here the actual data is downloaded to the Resource and a hash calculated which is used to find duplicates.
|
||||
9. Only then is the Resource written to the disk.
|
||||
|
||||
This is the step-by-step process that the BDFR goes through to download a Reddit post.
|
||||
|
||||
## Adding another Supported Site
|
||||
|
||||
This is one of the easiest changes to do with the code. First, any new class must inherit from the BaseDownloader class which provides an abstract parent to implement. However, take note of the other classes as well. Many downloaders can inherit from one another instead of just the BaseDownloader. For example, the VReddit class, used for downloading video from Reddit, inherits almost all of its code from the YouTube class. **Minimise code duplication wherever possible**.
|
||||
|
||||
Once the downloader class has been written **and tests added** for it as well, then the regex string for the site's URLs can be added to the DownloaderFactory. Then additional tests must be added for the DownloaderFactory to ensure that the appropriate classes are called when the right URLs are passed to the factory.
|
||||
|
||||
## Adding Other Features
|
||||
|
||||
For a fundamentally different form of execution path for the program, such as the difference between the `archive` and `download` commands, it is best to inherit from the RedditDownloader class and override or add functionality as needed.
|
|
@ -1,76 +0,0 @@
|
|||
# Changes on *master*
|
||||
## [16/08/2018](https://github.com/aliparlakci/bulk-downloader-for-reddit/tree/d56efed1c6833a66322d9158523b89d0ce57f5de)
|
||||
- Fix the bug that prevents downloading imgur videos
|
||||
|
||||
## [15/08/2018](https://github.com/aliparlakci/bulk-downloader-for-reddit/tree/adccd8f3ba03ad124d58643d78dab287a4123a6f)
|
||||
- Prints out the title of posts' that are already downloaded
|
||||
|
||||
## [13/08/2018](https://github.com/aliparlakci/bulk-downloader-for-reddit/tree/50cb7c15b9cb4befce0cfa2c23ab5de4af9176c6)
|
||||
- Added alternative location of current directory for config file
|
||||
- Fixed console prints on Linux
|
||||
|
||||
## [10/08/2018](https://github.com/aliparlakci/bulk-downloader-for-reddit/tree/8f1ff10a5e11464575284210dbba4a0d387bc1c3)
|
||||
- Added reddit username to config file
|
||||
|
||||
## [06/08/2018](https://github.com/aliparlakci/bulk-downloader-for-reddit/tree/210238d0865febcb57fbd9f0b0a7d3da9dbff384)
|
||||
- Sending headers when requesting a file in order not to be rejected by server
|
||||
|
||||
## [04/08/2018](https://github.com/aliparlakci/bulk-downloader-for-reddit/tree/426089d0f35212148caff0082708a87017757bde)
|
||||
- Disabled printing post types to console
|
||||
|
||||
## [30/07/2018](https://github.com/aliparlakci/bulk-downloader-for-reddit/tree/af294929510f884d92b25eaa855c29fc4fb6dcaa)
|
||||
- Now opens web browser and goes to Imgur when prompts for Imgur credentials
|
||||
|
||||
## [26/07/2018](https://github.com/aliparlakci/bulk-downloader-for-reddit/tree/1623722138bad80ae39ffcd5fb38baf80680deac)
|
||||
- Improved verbose mode
|
||||
- Minimalized the console output
|
||||
- Added quit option for auto quitting the program after process finishes
|
||||
|
||||
## [25/07/2018](https://github.com/aliparlakci/bulk-downloader-for-reddit/tree/1623722138bad80ae39ffcd5fb38baf80680deac)
|
||||
- Added verbose mode
|
||||
- Stylized the console output
|
||||
|
||||
## [24/07/2018](https://github.com/aliparlakci/bulk-downloader-for-reddit/tree/7a68ff3efac9939f9574c2cef6184b92edb135f4)
|
||||
- Added OP's name to file names (backwards compatible)
|
||||
- Deleted # char from file names (backwards compatible)
|
||||
- Improved exception handling
|
||||
|
||||
## [23/07/2018](https://github.com/aliparlakci/bulk-downloader-for-reddit/tree/7314e17125aa78fd4e6b28e26fda7ec7db7e0147)
|
||||
- Splited download() function
|
||||
- Added erome support
|
||||
- Removed exclude feature
|
||||
- Bug fixes
|
||||
|
||||
## [22/07/2018](https://github.com/aliparlakci/bulk-downloader-for-reddit/tree/6e7463005051026ad64006a8580b0b5dc9536b8c)
|
||||
- Put log files in a folder named "LOG_FILES"
|
||||
- Fixed the bug that makes multireddit mode unusable
|
||||
|
||||
## [21/07/2018](https://github.com/aliparlakci/bulk-downloader-for-reddit/tree/4a8c2377f9fb4d60ed7eeb8d50aaf9a26492462a)
|
||||
- Added exclude mode
|
||||
|
||||
## [20/07/2018](https://github.com/aliparlakci/bulk-downloader-for-reddit/tree/7548a010198fb693841ca03654d2c9bdf5742139)
|
||||
- "0" input for no limit
|
||||
- Fixed the bug that recognizes none image direct links as image links
|
||||
|
||||
## [19/07/2018](https://github.com/aliparlakci/bulk-downloader-for-reddit/tree/41cbb58db34f500a8a5ecc3ac4375bf6c3b275bb)
|
||||
- Added v.redd.it support
|
||||
- Added custom exception descriptions to FAILED.json file
|
||||
- Fixed the bug that prevents downloading some gfycat URLs
|
||||
|
||||
## [13/07/2018](https://github.com/aliparlakci/bulk-downloader-for-reddit/tree/9f831e1b784a770c82252e909462871401a05c11)
|
||||
- Changed config.json file's path to home directory
|
||||
|
||||
## [12/07/2018](https://github.com/aliparlakci/bulk-downloader-for-reddit/tree/50a77f6ba54c24f5647d5ea4e177400b71ff04a7)
|
||||
- Added binaries for Windows and Linux
|
||||
- Wait on KeyboardInterrupt
|
||||
- Accept multiple subreddit input
|
||||
- Fixed the bug that prevents choosing "[0] exit" with typing "exit"
|
||||
|
||||
## [11/07/2018](https://github.com/aliparlakci/bulk-downloader-for-reddit/tree/a28a7776ab826dea2a8d93873a94cd46db3a339b)
|
||||
- Improvements on UX and UI
|
||||
- Added logging errors to CONSOLE_LOG.txt
|
||||
- Using current directory if directory has not been given yet.
|
||||
|
||||
## [10/07/2018](https://github.com/aliparlakci/bulk-downloader-for-reddit/tree/ffe3839aee6dc1a552d95154d817aefc2b66af81)
|
||||
- Added support for *self* post
|
||||
- Now getting posts is quicker
|
74
docs/CODE_OF_CONDUCT.md
Normal file
74
docs/CODE_OF_CONDUCT.md
Normal file
|
@ -0,0 +1,74 @@
|
|||
# Contributor Covenant Code of Conduct
|
||||
|
||||
## Our Pledge
|
||||
|
||||
In the interest of fostering an open and welcoming environment, we as
|
||||
contributors and maintainers pledge to making participation in our project and
|
||||
our community a harassment-free experience for everyone, regardless of age, body
|
||||
size, disability, ethnicity, gender identity and expression, level of experience,
|
||||
education, socio-economic status, nationality, personal appearance, race,
|
||||
religion, or sexual identity and orientation.
|
||||
|
||||
## Our Standards
|
||||
|
||||
Examples of behavior that contributes to creating a positive environment
|
||||
include:
|
||||
|
||||
* Using welcoming and inclusive language
|
||||
* Being respectful of differing viewpoints and experiences
|
||||
* Gracefully accepting constructive criticism
|
||||
* Focusing on what is best for the community
|
||||
* Showing empathy towards other community members
|
||||
|
||||
Examples of unacceptable behavior by participants include:
|
||||
|
||||
* The use of sexualized language or imagery and unwelcome sexual attention or
|
||||
advances
|
||||
* Trolling, insulting/derogatory comments, and personal or political attacks
|
||||
* Public or private harassment
|
||||
* Publishing others' private information, such as a physical or electronic
|
||||
address, without explicit permission
|
||||
* Other conduct which could reasonably be considered inappropriate in a
|
||||
professional setting
|
||||
|
||||
## Our Responsibilities
|
||||
|
||||
Project maintainers are responsible for clarifying the standards of acceptable
|
||||
behavior and are expected to take appropriate and fair corrective action in
|
||||
response to any instances of unacceptable behavior.
|
||||
|
||||
Project maintainers have the right and responsibility to remove, edit, or
|
||||
reject comments, commits, code, wiki edits, issues, and other contributions
|
||||
that are not aligned to this Code of Conduct, or to ban temporarily or
|
||||
permanently any contributor for other behaviors that they deem inappropriate,
|
||||
threatening, offensive, or harmful.
|
||||
|
||||
## Scope
|
||||
|
||||
This Code of Conduct applies both within project spaces and in public spaces
|
||||
when an individual is representing the project or its community. Examples of
|
||||
representing a project or community include using an official project e-mail
|
||||
address, posting via an official social media account, or acting as an appointed
|
||||
representative at an online or offline event. Representation of a project may be
|
||||
further defined and clarified by project maintainers.
|
||||
|
||||
## Enforcement
|
||||
|
||||
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
||||
reported by contacting the project team via Discord. All complaints will
|
||||
be reviewed and investigated and will result in a response that is deemed
|
||||
necessary and appropriate to the circumstances. The project team is
|
||||
obligated to maintain confidentiality with regard to the reporter of an
|
||||
incident. Further details of specific enforcement policies may be posted
|
||||
separately.
|
||||
|
||||
Project maintainers who do not follow or enforce the Code of Conduct in good
|
||||
faith may face temporary or permanent repercussions as determined by other
|
||||
members of the project's leadership.
|
||||
|
||||
## Attribution
|
||||
|
||||
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
|
||||
available at <https://www.contributor-covenant.org/version/1/4/code-of-conduct.html>
|
||||
|
||||
[homepage]: https://www.contributor-covenant.org
|
|
@ -1,101 +0,0 @@
|
|||
# Using command-line arguments
|
||||
|
||||
See **[compiling from source](COMPILE_FROM_SOURCE.md)** page first unless you are using an executable file. If you are using an executable file, see [using terminal](COMPILE_FROM_SOURCE.md#using-terminal) and come back.
|
||||
|
||||
***Use*** `.\bulk-downloader-for-reddit.exe` ***or*** `./bulk-downloader-for-reddit` ***if you are using the executable***.
|
||||
```console
|
||||
$ python script.py --help
|
||||
usage: script.py [-h] [--directory DIRECTORY] [--NoDownload] [--verbose]
|
||||
[--quit] [--link link] [--saved] [--submitted] [--upvoted]
|
||||
[--log LOG FILE] [--subreddit SUBREDDIT [SUBREDDIT ...]]
|
||||
[--multireddit MULTIREDDIT] [--user redditor]
|
||||
[--search query] [--sort SORT TYPE] [--limit Limit]
|
||||
[--time TIME_LIMIT]
|
||||
|
||||
This program downloads media from reddit posts
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--directory DIRECTORY, -d DIRECTORY
|
||||
Specifies the directory where posts will be downloaded
|
||||
to
|
||||
--NoDownload Just gets the posts and stores them in a file for
|
||||
downloading later
|
||||
--verbose, -v Verbose Mode
|
||||
--quit, -q Auto quit afer the process finishes
|
||||
--link link, -l link Get posts from link
|
||||
--saved Triggers saved mode
|
||||
--submitted Gets posts of --user
|
||||
--upvoted Gets upvoted posts of --user
|
||||
--log LOG FILE Takes a log file which created by itself (json files),
|
||||
reads posts and tries downloading them again.
|
||||
--subreddit SUBREDDIT [SUBREDDIT ...]
|
||||
Triggers subreddit mode and takes subreddit's name
|
||||
without r/. use "frontpage" for frontpage
|
||||
--multireddit MULTIREDDIT
|
||||
Triggers multireddit mode and takes multireddit's name
|
||||
without m/
|
||||
--user redditor reddit username if needed. use "me" for current user
|
||||
--search query Searches for given query in given subreddits
|
||||
--sort SORT TYPE Either hot, top, new, controversial, rising or
|
||||
relevance default: hot
|
||||
--limit Limit default: unlimited
|
||||
--time TIME_LIMIT Either hour, day, week, month, year or all. default:
|
||||
all
|
||||
```
|
||||
|
||||
# Examples
|
||||
|
||||
- **Use `python3` instead of `python` if you are using *MacOS* or *Linux***
|
||||
|
||||
```console
|
||||
python script.py
|
||||
```
|
||||
|
||||
```console
|
||||
.\bulk-downloader-for-reddit.exe
|
||||
```
|
||||
|
||||
```console
|
||||
python script.py
|
||||
```
|
||||
|
||||
```console
|
||||
.\bulk-downloader-for-reddit.exe -- directory .\\NEW_FOLDER --search cats --sort new --time all --subreddit gifs pics --NoDownload
|
||||
```
|
||||
|
||||
```console
|
||||
./bulk-downloader-for-reddit --directory .\\NEW_FOLDER\\ANOTHER_FOLDER --saved --limit 1000
|
||||
```
|
||||
|
||||
```console
|
||||
python script.py --directory .\\NEW_FOLDER --sort new --time all --limit 10 --link "https://www.reddit.com/r/gifs/search?q=dogs&restrict_sr=on&type=link&sort=new&t=month"
|
||||
```
|
||||
|
||||
```console
|
||||
python script.py --directory .\\NEW_FOLDER --link "https://www.reddit.com/r/learnprogramming/comments/7mjw12/"
|
||||
```
|
||||
|
||||
```console
|
||||
python script.py --directory .\\NEW_FOLDER --search cats --sort new --time all --subreddit gifs pics --NoDownload
|
||||
```
|
||||
|
||||
```console
|
||||
python script.py --directory .\\NEW_FOLDER --user [USER_NAME] --submitted --limit 10
|
||||
```
|
||||
|
||||
```console
|
||||
python script.py --directory .\\NEW_FOLDER --multireddit good_subs --user [USER_NAME] --sort top --time week --limit 250
|
||||
```
|
||||
|
||||
```console
|
||||
python script.py --directory .\\NEW_FOLDER\\ANOTHER_FOLDER --saved --limit 1000
|
||||
```
|
||||
|
||||
```console
|
||||
python script.py --directory C:\\NEW_FOLDER\\ANOTHER_FOLDER --log UNNAMED_FOLDER\\FAILED.json
|
||||
```
|
||||
|
||||
# FAQ
|
||||
## I can't startup the script no matter what.
|
||||
See **[finding the correct keyword for Python](COMPILE_FROM_SOURCE.md#finding-the-correct-keyword-for-python)**
|
|
@ -1,40 +0,0 @@
|
|||
# Compiling from source code
|
||||
## Requirements
|
||||
### Python 3 Interpreter
|
||||
Latest* version of **Python 3** is needed. See if it is already installed [here](#finding-the-correct-keyword-for-python). If not, download the matching release for your platform [here](https://www.python.org/downloads/) and install it. If you are a *Windows* user, selecting **Add Python 3 to PATH** option when installing the software is mandatory.
|
||||
|
||||
\* *Use Python 3.6.5 if you encounter an issue*
|
||||
## Using terminal
|
||||
### To open it...
|
||||
- **On Windows**: Press **Shift+Right Click**, select **Open Powershell window here** or **Open Command Prompt window here**
|
||||
|
||||
- **On Linux**: Right-click in a folder and select **Open Terminal** or press **Ctrl+Alt+T**.
|
||||
|
||||
- **On MacOS**: Look for an app called **Terminal**.
|
||||
|
||||
### Navigating to the directory where script is downloaded
|
||||
Go inside the folder where script.py is located. If you are not familiar with changing directories on command-prompt and terminal read *Changing Directories* in [this article](https://lifehacker.com/5633909/who-needs-a-mouse-learn-to-use-the-command-line-for-almost-anything)
|
||||
|
||||
## Finding the correct keyword for Python
|
||||
Enter these lines to terminal window until it prints out the version you have downloaded and installed:
|
||||
|
||||
- `python --version`
|
||||
- `python3 --version`
|
||||
- `python3.7 --version`
|
||||
- `python3.6 --version`
|
||||
- `py --version`
|
||||
- `py -3 --version`
|
||||
- `py -3.6 --version`
|
||||
- `py -3.7 --version`
|
||||
|
||||
Once it does, your keyword is without the `--version` part.
|
||||
|
||||
## Installing dependencies
|
||||
Enter the line below to terminal window when you are in the directory where script.py is, use your keyword for Python:
|
||||
```console
|
||||
python -m pip install -r requirements.txt
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
Now, you can go to [Using command-line arguments](COMMAND_LINE_ARGUMENTS.md)
|
142
docs/CONTRIBUTING.md
Normal file
142
docs/CONTRIBUTING.md
Normal file
|
@ -0,0 +1,142 @@
|
|||
# Contributing
|
||||
|
||||
When making a contribution to the BDFR project, please open an issue beforehand so that the maintainers can weigh in on it. This helps create a trail on GitHub and keeps things organised.
|
||||
|
||||
**Please don't open an issue on GitHub** unless you are reporting a bug or proposing a feature. For questions, there is a discussion tab on the repository's GitHub page where you can interact with the developers and ask questions. If you believe that something is a bug, or that a feature should be added, then by all means open an issue.
|
||||
|
||||
All communication on GitHub, Discord, email, or any other medium must conform to the [Code of Conduct](CODE_OF_CONDUCT.md). It's not that hard to stay respectful.
|
||||
|
||||
## Opening an Issue
|
||||
|
||||
**Before opening a new issue**, be sure that no issues regarding your problem already exist. If a similar issue exists, try to contribute to the issue.
|
||||
|
||||
### Bugs
|
||||
|
||||
When opening an issue about a bug, **please provide the full log file for the run in which the bug occurred**. This log file is named `log_output.txt` in the configuration folder. Check the [README](../README.md) for information on where this is. This log file will contain all the information required for the developers to recreate the bug.
|
||||
|
||||
If you do not have or cannot find the log file, then at minimum please provide the **Reddit ID for the submission** or comment which caused the issue. Also copy in the command that you used to run the BDFR from the command line, as that will also provide helpful information when trying to find and fix the bug. If needed, more information will be asked in the thread of the bug.
|
||||
|
||||
### Feature requests
|
||||
|
||||
In the case of requesting a feature or an enhancement, there are fewer requirements. However, please be clear in what you would like the BDFR to do and also how the feature/enhancement would be used or would be useful to more people. It is crucial that the feature is justified. Any feature request without a concrete reason for it to be implemented has a very small chance to get accepted. Be aware that proposed enhancements may be rejected for multiple reasons, or no reason, at the discretion of the developers.
|
||||
|
||||
## Pull Requests
|
||||
|
||||
Before creating a pull request (PR), check out [ARCHITECTURE](ARCHITECTURE.md) for a short introduction to the way that the BDFR is coded and how the code is organised. Also read the [Style Guide](#style-guide) section below before actually writing any code.
|
||||
|
||||
Once you have done both of these, the below list shows the path that should be followed when writing a PR.
|
||||
|
||||
1. If an issue does not already exist, open one that will relate to the PR.
|
||||
2. Ensure that any changes fit into the architecture specified above.
|
||||
3. Ensure that you have written tests that cover the new code.
|
||||
4. Ensure that no existing tests fail, unless there is a good reason for them to do so.
|
||||
5. If needed, update any documentation with changes.
|
||||
6. Open a pull request that references the relevant issue.
|
||||
7. Expect changes or suggestions and heed the Code of Conduct. We're all volunteers here.
|
||||
|
||||
Someone will review your pull request as soon as possible, but remember that all maintainers are volunteers and this won't happen immediately. Once it is approved, congratulations! Your code is now part of the BDFR.
|
||||
|
||||
## Preparing the environment for development
|
||||
|
||||
Bulk Downloader for Reddit requires Python 3.9 at minimum. First, ensure that your Python installation satisfies this.
|
||||
|
||||
BDfR is built in a way that it can be packaged and installed via `pip`. This places BDfR next to other Python packages and enables you to run the program from any directory. Since it is managed by pip, you can also uninstall it.
|
||||
|
||||
To install the program, clone the repository and run pip inside the project's root directory:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/aliparlakci/bulk-downloader-for-reddit.git
|
||||
cd ./bulk-downloader-for-reddit
|
||||
python3 -m pip install -e .
|
||||
```
|
||||
|
||||
**`-e`** parameter creates a link to that folder. That is, any change inside the folder affects the package immediately. So, when developing, you can be sure that the package is not stale and Python is always running your latest changes. (Due to this linking, moving/removing/renaming the folder might break it)
|
||||
|
||||
Then, you can run the program from anywhere in your disk as such:
|
||||
|
||||
```bash
|
||||
bdfr
|
||||
```
|
||||
|
||||
There are additional Python packages that are required to develop the BDFR. These can be installed with the following command:
|
||||
|
||||
```bash
|
||||
python3 -m pip install -e .[dev]
|
||||
```
|
||||
|
||||
### Tools
|
||||
|
||||
The BDFR project uses several tools to manage the code of the project. These include:
|
||||
|
||||
- [black](https://github.com/psf/black)
|
||||
- [flake8](https://github.com/john-hen/Flake8-pyproject)
|
||||
- [isort](https://github.com/PyCQA/isort)
|
||||
- [markdownlint (mdl)](https://github.com/markdownlint/markdownlint)
|
||||
- [tox](https://tox.wiki/en/latest/)
|
||||
- [pre-commit](https://github.com/pre-commit/pre-commit)
|
||||
|
||||
The first four tools are formatters. These change the code to the standards expected for the BDFR project. The configuration details for these tools are contained in the [pyproject.toml](../pyproject.toml) file for the project.
|
||||
|
||||
The tool `tox` is used to run tests and tools on demand and has the following environments:
|
||||
|
||||
- `format`
|
||||
- `format_check`
|
||||
|
||||
The tool `pre-commit` is optional, and runs the three formatting tools automatically when a commit is made. This is **highly recommended** to ensure that all code submitted for this project is formatted acceptably. Note that any PR that does not follow the formatting guide will not be accepted. For information on how to use pre-commit to avoid this, see [the pre-commit documentation](https://pre-commit.com/).
|
||||
|
||||
## Style Guide
|
||||
|
||||
The BDFR uses the Black formatting standard and enforces this with the tool by the same name. Additionally, the tool isort is used as well to format imports.
|
||||
|
||||
See [Preparing the Environment for Development](#preparing-the-environment-for-development) for how to setup these tools to run automatically.
|
||||
|
||||
## Tests
|
||||
|
||||
### Running Tests
|
||||
|
||||
There are a lot of tests in the BDFR. In fact, there are more tests than lines of functional code. This is one of the strengths of the BDFR in that it is fully tested. The codebase uses the package pytest to create the tests, which is a third-party package that provides many functions and objects useful for testing Python code.
|
||||
|
||||
When submitting a PR, it is required that you run **all** possible tests to ensure that any new commits haven't broken anything. Otherwise, while writing the request, it can be helpful (and much quicker) to run only a subset of the tests.
|
||||
|
||||
This is accomplished with marks, a system that pytest uses to categorise tests. The following marks are currently in use in the BDFR test suite.
|
||||
|
||||
- `slow`
|
||||
- This marks a test that may take a long time to complete
|
||||
- Usually marks a test that downloads many submissions or downloads a particularly large resource
|
||||
- `online`
|
||||
- This marks a test that requires an internet connection and uses online resources
|
||||
- `reddit`
|
||||
- This marks a test that accesses online Reddit specifically
|
||||
- `authenticated`
|
||||
- This marks a test that requires a test configuration file with a valid OAuth2 token
|
||||
|
||||
These tests can be run either all at once, or excluding certain marks. The tests that require online resources, such as those marked `reddit` or `online`, will naturally require more time to run than tests that are entirely offline. To run tests, you must be in the root directory of the project and can use the following command.
|
||||
|
||||
```bash
|
||||
pytest
|
||||
```
|
||||
|
||||
To exclude one or more marks, the following command can be used, substituting the unwanted mark.
|
||||
|
||||
```bash
|
||||
pytest -m "not online"
|
||||
pytest -m "not reddit and not authenticated"
|
||||
```
|
||||
|
||||
### Configuration for authenticated tests
|
||||
|
||||
There should be a configuration file `test_config.cfg` in the project's root directory to be able to run the integration tests with reddit authentication. See how to create such files [here](../README.md#configuration). The easiest way of creating this file is copying your existing `default_config.cfg` file from the path stated in the previous link and renaming it to `test_config.cfg`. Be sure that the `user_token` key exists in `test_config.cfg`.
|
||||
|
||||
---
|
||||
|
||||
For more details, review the pytest documentation that is freely available online.
|
||||
|
||||
Many IDEs also provide integrated functionality to run and display the results from tests, and almost all of them support pytest in some capacity. This would be the recommended method due to the additional debugging and general capabilities.
|
||||
|
||||
### Writing Tests
|
||||
|
||||
When writing tests, ensure that they follow the style guide. The BDFR uses pytest to run tests. Wherever possible, parameterise tests, even if you only have one test case. This makes it easier to expand in the future, as the ultimate goal is to have multiple test cases for every test, instead of just one.
|
||||
|
||||
If required, use of mocks is expected to simplify tests and reduce the resources or complexity required. Tests should be as small as possible and test as small a part of the code as possible. Comprehensive or integration tests are run with the `click` framework and are located in their own file.
|
||||
|
||||
It is also expected that new tests be classified correctly with the marks described above i.e. if a test accesses Reddit through a `reddit_instance` object, it must be given the `reddit` mark. If it requires an authenticated Reddit instance, then it must have the `authenticated` mark.
|
23
docs/FAQ.md
23
docs/FAQ.md
|
@ -1,23 +0,0 @@
|
|||
# FAQ
|
||||
## What do the dots represent when getting posts?
|
||||
- Each dot means that 100 posts are scanned.
|
||||
|
||||
## Getting posts is taking too long.
|
||||
- You can press Ctrl+C to interrupt it and start downloading.
|
||||
|
||||
## How are filenames formatted?
|
||||
- Self posts and images that do not belong to an album are formatted as **`[SUBMITTER NAME]_[POST TITLE]_[REDDIT ID]`**.
|
||||
You can use *reddit id* to go to post's reddit page by going to link **reddit.com/[REDDIT ID]**
|
||||
|
||||
- An image in an imgur album is formatted as **`[ITEM NUMBER]_[IMAGE TITLE]_[IMGUR ID]`**
|
||||
Similarly, you can use *imgur id* to go to image's imgur page by going to link **imgur.com/[IMGUR ID]**.
|
||||
|
||||
## How do I open self post files?
|
||||
- Self posts are stored on reddit as markdown-styled text. So, the script downloads them as they are in order not to lose their styling.
|
||||
However, there is a [great Chrome extension](https://chrome.google.com/webstore/detail/markdown-viewer/ckkdlimhmcjmikdlpkmbgfkaikojcbjk) for viewing Markdown files with its styling. Install it and open the files with [Chrome](https://www.google.com/intl/tr/chrome/).
|
||||
|
||||
However, they are basically text files. You can also view them with any text editor such as Notepad on Windows, gedit on Linux or Text Editor on MacOS
|
||||
|
||||
## How can I change my credentials?
|
||||
- All of the user data is held in **config.json** file which is in a folder named "Bulk Downloader for Reddit" in your **Home** directory. You can edit
|
||||
them, there.
|
9
opts_example.yaml
Normal file
9
opts_example.yaml
Normal file
|
@ -0,0 +1,9 @@
|
|||
skip: [mp4, avi, mov]
|
||||
file_scheme: "{UPVOTES}_{REDDITOR}_{POSTID}_{DATE}"
|
||||
limit: 10
|
||||
sort: top
|
||||
time: all
|
||||
no_dupes: true
|
||||
subreddit:
|
||||
- EarthPorn
|
||||
- CityPorn
|
88
pyproject.toml
Normal file
88
pyproject.toml
Normal file
|
@ -0,0 +1,88 @@
|
|||
[build-system]
|
||||
requires = ["setuptools>=65.6.0", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "bdfr"
|
||||
description = "Downloads and archives content from reddit"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.9"
|
||||
license = {file = "LICENSE"}
|
||||
keywords = ["reddit", "download", "archive",]
|
||||
authors = [{name = "Ali Parlakci", email = "parlakciali@gmail.com"}]
|
||||
maintainers = [{name = "Serene Arc", email = "serenical@gmail.com"}]
|
||||
classifiers = [
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
"Environment :: Console",
|
||||
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
|
||||
"Natural Language :: English",
|
||||
"Operating System :: OS Independent",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
]
|
||||
dependencies = [
|
||||
"appdirs>=1.4.4",
|
||||
"beautifulsoup4>=4.10.0",
|
||||
"click>=8.0.0",
|
||||
"dict2xml>=1.7.0",
|
||||
"praw>=7.2.0",
|
||||
"pyyaml>=5.4.1",
|
||||
"requests>=2.25.1",
|
||||
"yt-dlp>=2022.11.11",
|
||||
]
|
||||
dynamic = ["version"]
|
||||
|
||||
[tool.setuptools]
|
||||
dynamic = {"version" = {attr = 'bdfr.__version__'}}
|
||||
packages = ["bdfr", "bdfr.archive_entry", "bdfr.site_downloaders", "bdfr.site_downloaders.fallback_downloaders",]
|
||||
data-files = {"config" = ["bdfr/default_config.cfg",]}
|
||||
|
||||
[project.optional-dependencies]
|
||||
dev = [
|
||||
"black>=22.12.0",
|
||||
"Flake8-pyproject>=1.2.2",
|
||||
"isort>=5.11.4",
|
||||
"pre-commit>=2.20.0",
|
||||
"pytest>=7.1.0",
|
||||
"tox>=3.27.1",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
"Homepage" = "https://aliparlakci.github.io/bulk-downloader-for-reddit"
|
||||
"Source" = "https://github.com/aliparlakci/bulk-downloader-for-reddit"
|
||||
"Bug Reports" = "https://github.com/aliparlakci/bulk-downloader-for-reddit/issues"
|
||||
|
||||
[project.scripts]
|
||||
bdfr = "bdfr.__main__:cli"
|
||||
bdfr-archive = "bdfr.__main__:cli_archive"
|
||||
bdfr-clone = "bdfr.__main__:cli_clone"
|
||||
bdfr-download = "bdfr.__main__:cli_download"
|
||||
|
||||
[tool.black]
|
||||
line-length = 120
|
||||
|
||||
[tool.flake8]
|
||||
exclude = ["scripts"]
|
||||
max-line-length = 120
|
||||
show-source = true
|
||||
statistics = true
|
||||
|
||||
[tool.isort]
|
||||
profile = "black"
|
||||
py_version = 39
|
||||
multi_line_output = 3
|
||||
line_length = 120
|
||||
indent = 4
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
minversion = "7.1"
|
||||
addopts = "--strict-markers"
|
||||
testpaths = "tests"
|
||||
markers = [
|
||||
"online: tests require a connection to the internet",
|
||||
"reddit: tests require a connection to Reddit",
|
||||
"slow: test is slow to run",
|
||||
"authenticated: test requires an authenticated Reddit instance",
|
||||
]
|
|
@ -1,3 +0,0 @@
|
|||
requests
|
||||
praw
|
||||
imgurpython
|
710
script.py
710
script.py
|
@ -1,710 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
"""
|
||||
This program downloads imgur, gfycat and direct image and video links of
|
||||
saved posts from a reddit account. It is written in Python 3.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import webbrowser
|
||||
from io import StringIO
|
||||
from pathlib import Path, PurePath
|
||||
|
||||
from src.downloader import Direct, Erome, Gfycat, Imgur, Self
|
||||
from src.errors import *
|
||||
from src.parser import LinkDesigner
|
||||
from src.searcher import getPosts
|
||||
from src.tools import (GLOBAL, createLogFile, jsonFile, nameCorrector,
|
||||
printToFile)
|
||||
|
||||
__author__ = "Ali Parlakci"
|
||||
__license__ = "GPL"
|
||||
__version__ = "1.6.2.1"
|
||||
__maintainer__ = "Ali Parlakci"
|
||||
__email__ = "parlakciali@gmail.com"
|
||||
|
||||
def getConfig(configFileName):
    """Read the imgur API credentials from the given JSON config file.

    Creates the file and interactively prompts the user for any missing
    or empty credential, opening the imgur client-registration page in a
    browser to help. Returns the parsed configuration dictionary.
    """

    keys = ['imgur_client_id', 'imgur_client_secret']

    if os.path.exists(configFileName):
        FILE = jsonFile(configFileName)
        content = FILE.read()
        # An empty refresh token is useless; drop it so a fresh one
        # is requested later.
        if content.get("reddit_refresh_token") == "":
            FILE.delete("reddit_refresh_token")

        # Original used all(False if … else True …); this is the same
        # predicate without the double negative.
        if not all(content.get(key, "") != "" for key in keys):
            print(
                "Go to this URL and fill the form: "
                "https://api.imgur.com/oauth2/addclient\n"
                "Enter the client id and client secret here:"
            )
            webbrowser.open("https://api.imgur.com/oauth2/addclient", new=2)

            # Prompt only for keys that are missing or empty.
            for key in keys:
                if content.get(key, "") == "":
                    FILE.add({key: input(" " + key + ": ")})

        # Re-read so the returned dict reflects anything just added.
        return jsonFile(configFileName).read()

    else:
        FILE = jsonFile(configFileName)
        configDictionary = {}
        print(
            "Go to this URL and fill the form: "
            "https://api.imgur.com/oauth2/addclient\n"
            "Enter the client id and client secret here:"
        )
        webbrowser.open("https://api.imgur.com/oauth2/addclient", new=2)
        for key in keys:
            configDictionary[key] = input(" " + key + ": ")
        FILE.add(configDictionary)
        return FILE.read()
|
||||
|
||||
def parseArguments(arguments=None):
    """Initialize argparse and add arguments.

    :param arguments: optional list of argument strings to parse; when
        omitted (or empty, matching the old behaviour) the arguments are
        taken from ``sys.argv``.
    :return: the parsed ``argparse.Namespace``.
    """

    # NOTE: the original signature was ``arguments=[]`` — a shared
    # mutable default. ``None`` keeps the same observable behaviour.
    parser = argparse.ArgumentParser(
        allow_abbrev=False,
        description="This program downloads media from reddit posts"
    )

    parser.add_argument("--directory", "-d",
                        help="Specifies the directory where posts will be "
                             "downloaded to",
                        metavar="DIRECTORY")

    parser.add_argument("--NoDownload",
                        help="Just gets the posts and stores them in a file"
                             " for downloading later",
                        action="store_true",
                        default=False)

    parser.add_argument("--verbose", "-v",
                        help="Verbose Mode",
                        action="store_true",
                        default=False)

    # Typo fix: "afer" -> "after" in the user-facing help text.
    parser.add_argument("--quit", "-q",
                        help="Auto quit after the process finishes",
                        action="store_true",
                        default=False)

    parser.add_argument("--link", "-l",
                        help="Get posts from link",
                        metavar="link")

    parser.add_argument("--saved",
                        action="store_true",
                        help="Triggers saved mode")

    parser.add_argument("--submitted",
                        action="store_true",
                        help="Gets posts of --user")

    parser.add_argument("--upvoted",
                        action="store_true",
                        help="Gets upvoted posts of --user")

    parser.add_argument("--log",
                        help="Takes a log file which created by itself "
                             "(json files), reads posts and tries downloadin"
                             "g them again.",
                        metavar="LOG FILE")

    parser.add_argument("--subreddit",
                        nargs="+",
                        help="Triggers subreddit mode and takes subreddit's "
                             "name without r/. use \"frontpage\" for frontpage",
                        metavar="SUBREDDIT",
                        type=str)

    parser.add_argument("--multireddit",
                        help="Triggers multireddit mode and takes "
                             "multireddit's name without m/",
                        metavar="MULTIREDDIT",
                        type=str)

    # --user becomes mandatory when a mode that needs a redditor name
    # is present on the real command line.
    parser.add_argument("--user",
                        help="reddit username if needed. use \"me\" for "
                             "current user",
                        required=("--multireddit" in sys.argv
                                  or "--submitted" in sys.argv),
                        metavar="redditor",
                        type=str)

    parser.add_argument("--search",
                        help="Searches for given query in given subreddits",
                        metavar="query",
                        type=str)

    parser.add_argument("--sort",
                        help="Either hot, top, new, controversial, rising "
                             "or relevance default: hot",
                        choices=["hot", "top", "new", "controversial",
                                 "rising", "relevance"],
                        metavar="SORT TYPE",
                        type=str)

    parser.add_argument("--limit",
                        help="default: unlimited",
                        metavar="Limit",
                        type=int)

    parser.add_argument("--time",
                        help="Either hour, day, week, month, year or all."
                             " default: all",
                        choices=["all", "hour", "day", "week", "month", "year"],
                        metavar="TIME_LIMIT",
                        type=str)

    if not arguments:
        return parser.parse_args()
    return parser.parse_args(arguments)
|
||||
|
||||
def checkConflicts():
    """Validate the combination of parsed command-line arguments.

    Exactly one program mode must be active. Raises ProgramModeError for
    an invalid mode count, SearchModeError when --search is combined
    with a mode that cannot be searched, and RedditorNameError when a
    redditor-based mode is used without --user.
    """

    user = 0 if GLOBAL.arguments.user is None else 1
    search = 1 if GLOBAL.arguments.search else 0

    modes = [
        "saved", "subreddit", "submitted", "log", "link", "upvoted", "multireddit"
    ]

    # 1 for every mode flag that was actually supplied, 0 otherwise.
    values = {}
    for mode in modes:
        flag = getattr(GLOBAL.arguments, mode)
        values[mode] = 0 if flag is None or flag is False else 1

    if sum(values.values()) != 1:
        raise ProgramModeError("Invalid program mode")

    # --search is incompatible with these modes.
    unsearchable = (
        ("saved", "You cannot search in your saved posts"),
        ("submitted", "You cannot search in submitted posts"),
        ("upvoted", "You cannot search in upvoted posts"),
        ("log", "You cannot search in log files"),
    )
    for mode, message in unsearchable:
        if search + values[mode] == 2:
            raise SearchModeError(message)

    if values["upvoted"] + values["submitted"] == 1 and user == 0:
        raise RedditorNameError("No redditor name given")
|
||||
|
||||
class PromptUser:
    """Interactively ask the user for the program mode and its options,
    storing the answers directly into GLOBAL.arguments."""

    @staticmethod
    def chooseFrom(choices):
        """Print a numbered menu and return the user's selection.

        Accepts either the option text itself or its 1-based number;
        "0" or "exit" terminates the program via sys.exit().
        """
        print()
        choicesByIndex = list(str(x) for x in range(len(choices) + 1))
        for i in range(len(choices)):
            print("{indent}[{order}] {mode}".format(
                indent=" " * 4, order=i + 1, mode=choices[i]
            ))
        print(" " * 4 + "[0] exit\n")
        choice = input("> ")
        while not choice.lower() in choices + choicesByIndex + ["exit"]:
            print("Invalid input\n")
            # BUG FIX: the original assigned the retry input to an unused
            # variable (programModeIndex), so any invalid entry looped
            # forever. The retry must update `choice`.
            choice = input("> ")

        if choice == "0" or choice == "exit":
            sys.exit()
        elif choice in choicesByIndex:
            return choices[int(choice) - 1]
        else:
            return choice

    def __init__(self):
        print("select program mode:")
        programModes = [
            "search", "subreddit", "multireddit",
            "submitted", "upvoted", "saved", "log"
        ]
        programMode = self.chooseFrom(programModes)

        if programMode == "search":
            GLOBAL.arguments.search = input("\nquery: ")
            GLOBAL.arguments.subreddit = input("\nsubreddit: ")

            print("\nselect sort type:")
            sortTypes = [
                "relevance", "top", "new"
            ]
            GLOBAL.arguments.sort = self.chooseFrom(sortTypes)

            print("\nselect time filter:")
            timeFilters = [
                "hour", "day", "week", "month", "year", "all"
            ]
            GLOBAL.arguments.time = self.chooseFrom(timeFilters)

        if programMode == "subreddit":

            subredditInput = input("subreddit (enter frontpage for frontpage): ")
            GLOBAL.arguments.subreddit = subredditInput

            # Keep collecting subreddit names until the user enters an
            # empty line (or "frontpage"); names are joined with "+".
            while not (subredditInput == "" or subredditInput.lower() == "frontpage"):
                subredditInput = input("subreddit: ")
                GLOBAL.arguments.subreddit += "+" + subredditInput

            if " " in GLOBAL.arguments.subreddit:
                GLOBAL.arguments.subreddit = "+".join(GLOBAL.arguments.subreddit.split())

            # Delete the plus (+) left behind by the terminating empty input
            if not subredditInput.lower() == "frontpage":
                GLOBAL.arguments.subreddit = GLOBAL.arguments.subreddit[:-1]

            print("\nselect sort type:")
            sortTypes = [
                "hot", "top", "new", "rising", "controversial"
            ]
            sortType = self.chooseFrom(sortTypes)
            GLOBAL.arguments.sort = sortType

            # Only "top" and "controversial" support a time filter.
            if sortType in ["top", "controversial"]:
                print("\nselect time filter:")
                timeFilters = [
                    "hour", "day", "week", "month", "year", "all"
                ]
                GLOBAL.arguments.time = self.chooseFrom(timeFilters)
            else:
                GLOBAL.arguments.time = "all"

        elif programMode == "multireddit":
            GLOBAL.arguments.user = input("\nredditor: ")
            GLOBAL.arguments.multireddit = input("\nmultireddit: ")

            print("\nselect sort type:")
            sortTypes = [
                "hot", "top", "new", "rising", "controversial"
            ]
            sortType = self.chooseFrom(sortTypes)
            GLOBAL.arguments.sort = sortType

            if sortType in ["top", "controversial"]:
                print("\nselect time filter:")
                timeFilters = [
                    "hour", "day", "week", "month", "year", "all"
                ]
                GLOBAL.arguments.time = self.chooseFrom(timeFilters)
            else:
                GLOBAL.arguments.time = "all"

        elif programMode == "submitted":
            GLOBAL.arguments.submitted = True
            GLOBAL.arguments.user = input("\nredditor: ")

            print("\nselect sort type:")
            sortTypes = [
                "hot", "top", "new", "controversial"
            ]
            sortType = self.chooseFrom(sortTypes)
            GLOBAL.arguments.sort = sortType

            if sortType == "top":
                print("\nselect time filter:")
                timeFilters = [
                    "hour", "day", "week", "month", "year", "all"
                ]
                GLOBAL.arguments.time = self.chooseFrom(timeFilters)
            else:
                GLOBAL.arguments.time = "all"

        elif programMode == "upvoted":
            GLOBAL.arguments.upvoted = True
            GLOBAL.arguments.user = input("\nredditor: ")

        elif programMode == "saved":
            GLOBAL.arguments.saved = True

        elif programMode == "log":
            # Re-prompt until an existing log file path is given.
            while True:
                GLOBAL.arguments.log = input("\nlog file directory:")
                if Path(GLOBAL.arguments.log).is_file():
                    break
            # Re-prompt until a valid integer limit is given; 0 means no limit.
            while True:
                try:
                    GLOBAL.arguments.limit = int(input("\nlimit (0 for none): "))
                    if GLOBAL.arguments.limit == 0:
                        GLOBAL.arguments.limit = None
                    break
                except ValueError:
                    pass
|
||||
|
||||
def prepareAttributes():
    """Build the attribute dictionary handed to the post searcher.

    Translates the parsed command-line options in GLOBAL.arguments into
    a single dict of search parameters. Raises InvalidSortingType when
    submitted mode is combined with the unsupported "rising" sort.
    """
    ATTRIBUTES = {}

    if GLOBAL.arguments.user is not None:
        ATTRIBUTES["user"] = GLOBAL.arguments.user

    if GLOBAL.arguments.search is not None:
        ATTRIBUTES["search"] = GLOBAL.arguments.search
        # Search does not support these sort orders; fall back to relevance.
        if GLOBAL.arguments.sort in ("hot", "controversial", "rising"):
            GLOBAL.arguments.sort = "relevance"

    if GLOBAL.arguments.sort is not None:
        ATTRIBUTES["sort"] = GLOBAL.arguments.sort
    else:
        # Defaults differ by mode: submitted defaults to "new".
        if GLOBAL.arguments.submitted:
            ATTRIBUTES["sort"] = "new"
        else:
            ATTRIBUTES["sort"] = "hot"

    if GLOBAL.arguments.time is not None:
        ATTRIBUTES["time"] = GLOBAL.arguments.time
    else:
        ATTRIBUTES["time"] = "all"

    if GLOBAL.arguments.link is not None:

        GLOBAL.arguments.link = GLOBAL.arguments.link.strip("\"")

        # NOTE: LinkDesigner *replaces* the dict built so far; the
        # search/sort/time keys are re-applied below on top of it.
        ATTRIBUTES = LinkDesigner(GLOBAL.arguments.link)

        if GLOBAL.arguments.search is not None:
            ATTRIBUTES["search"] = GLOBAL.arguments.search

        if GLOBAL.arguments.sort is not None:
            ATTRIBUTES["sort"] = GLOBAL.arguments.sort

        if GLOBAL.arguments.time is not None:
            ATTRIBUTES["time"] = GLOBAL.arguments.time

    elif GLOBAL.arguments.subreddit is not None:
        # Idiom fix: isinstance() instead of type() == list.
        if isinstance(GLOBAL.arguments.subreddit, list):
            GLOBAL.arguments.subreddit = "+".join(GLOBAL.arguments.subreddit)

        ATTRIBUTES["subreddit"] = GLOBAL.arguments.subreddit

    elif GLOBAL.arguments.multireddit is not None:
        ATTRIBUTES["multireddit"] = GLOBAL.arguments.multireddit

    elif GLOBAL.arguments.saved is True:
        ATTRIBUTES["saved"] = True

    elif GLOBAL.arguments.upvoted is True:
        ATTRIBUTES["upvoted"] = True

    elif GLOBAL.arguments.submitted is not None:
        # NOTE(review): with action="store_true" this value is False (not
        # None) by default, so this branch is always taken when reached.
        # It appears to rely on checkConflicts() having already ensured
        # exactly one mode is active — confirm before changing.
        ATTRIBUTES["submitted"] = True

        if GLOBAL.arguments.sort == "rising":
            raise InvalidSortingType("Invalid sorting type has given")

    ATTRIBUTES["limit"] = GLOBAL.arguments.limit

    return ATTRIBUTES
|
||||
|
||||
def postFromLog(fileName):
    """Analyze a log file and return a list of dictionaries containing
    submissions.

    Skips the "HEADER" entry and any submission whose last recorded
    attempt has a null postType. Exits the program if the file is missing.
    """
    if Path(fileName).is_file():
        content = jsonFile(fileName).read()
    else:
        print("File not found")
        sys.exit()

    # The header entry is metadata, not a submission.
    content.pop("HEADER", None)

    posts = []

    for post in content:
        # Idiom fix: compare against None with `is`, not `==`.
        if content[post][-1]['postType'] is not None:
            posts.append(content[post][-1])

    return posts
|
||||
|
||||
def isPostExists(POST):
    """Return True when a file for this submission already exists on disk.

    Every filename scheme the program has ever used is checked, so posts
    downloaded by older versions are still recognised as duplicates.
    """

    title = nameCorrector(POST['postTitle'])
    PATH = GLOBAL.directory / POST["postSubreddit"]

    # If you change the filenames, don't forget to add them here.
    # Please don't remove existing ones.
    possibleExtensions = [".jpg", ".png", ".mp4", ".gif", ".webm", ".md"]

    for extension in possibleExtensions:
        # Old scheme: TITLE_ID, current scheme: SUBMITTER_TITLE_ID,
        # short scheme: ID only.
        candidates = (
            PATH / (title + "_" + POST['postId'] + extension),
            PATH / (POST["postSubmitter"] + "_" + title
                    + "_" + POST['postId'] + extension),
            PATH / (POST['postId'] + extension),
        )
        if any(candidate.exists() for candidate in candidates):
            return True

    return False
|
||||
|
||||
def downloadPost(SUBMISSION):
    """Dispatch one submission to the downloader matching its postType.

    Raises NoSuitablePost for unsupported post types and ImgurLimitError
    when the imgur API rate limit is exhausted. Imgur requests are
    throttled to roughly one every three seconds.
    """

    # Download directory is declared here for each file
    directory = GLOBAL.directory / SUBMISSION['postSubreddit']

    global lastRequestTime

    downloaders = {
        "imgur": Imgur, "gfycat": Gfycat, "erome": Erome,
        "direct": Direct, "self": Self
    }

    print()
    if SUBMISSION['postType'] in downloaders:

        if SUBMISSION['postType'] == "imgur":

            # PERF FIX: the original busy-waited here
            # (while int(time.time() - lastRequestTime) <= 2: pass),
            # burning a full CPU core. Sleep for the remaining time instead.
            remaining = 3 - (time.time() - lastRequestTime)
            if remaining > 0:
                time.sleep(remaining)

            credit = Imgur.get_credits()

            IMGUR_RESET_TIME = credit['UserReset'] - time.time()
            USER_RESET = ("after "
                          + str(int(IMGUR_RESET_TIME / 60))
                          + " Minutes "
                          + str(int(IMGUR_RESET_TIME % 60))
                          + " Seconds")

            # Only show the credit banner when credits are running low.
            if credit['ClientRemaining'] < 25 or credit['UserRemaining'] < 25:
                printCredit = {"noPrint": False}
            else:
                printCredit = {"noPrint": True}

            print(
                "==> Client: {} - User: {} - Reset {}\n".format(
                    credit['ClientRemaining'],
                    credit['UserRemaining'],
                    USER_RESET
                ), end="", **printCredit
            )

            if credit['UserRemaining'] == 0 or credit['ClientRemaining'] == 0:
                # No unbound-name risk: exactly one of the two is chosen.
                KEYWORD = "user" if credit['UserRemaining'] == 0 else "client"
                raise ImgurLimitError('{} LIMIT EXCEEDED\n'.format(KEYWORD.upper()))

            # Throttle again before the actual download request.
            remaining = 3 - (time.time() - lastRequestTime)
            if remaining > 0:
                time.sleep(remaining)

            lastRequestTime = time.time()

        downloaders[SUBMISSION['postType']](directory, SUBMISSION)

    else:
        raise NoSuitablePost

    return None
|
||||
|
||||
def _formatException(exception):
    """Render an exception as 'ClassName: message' for console and log output."""
    return "{class_name}: {info}".format(
        class_name=exception.__class__.__name__, info=str(exception)
    )

def download(submissions):
    """Analyze list of submissions and call the right function
    to download each one, catch errors, update the log files

    submissions -- list of submission dicts (see downloadPost).
    Exits the program on ImgurLoginError; all other per-post failures are
    recorded in the FAILED log file and the run continues.
    """

    subsLength = len(submissions)
    global lastRequestTime
    lastRequestTime = 0
    # Start from the total and subtract every failure/duplicate.
    downloadedCount = subsLength
    duplicates = 0

    FAILED_FILE = createLogFile("FAILED")

    for i in range(subsLength):
        print(f"\n({i+1}/{subsLength}) – r/{submissions[i]['postSubreddit']}",
              end="")
        # Post type goes to the log file only.
        print(f" – {submissions[i]['postType'].upper()}",end="",noPrint=True)

        if isPostExists(submissions[i]):
            print(f"\n{nameCorrector(submissions[i]['postTitle'])}")
            print("It already exists")
            duplicates += 1
            downloadedCount -= 1
            continue

        try:
            downloadPost(submissions[i])

        except FileAlreadyExistsError:
            print("It already exists")
            duplicates += 1
            downloadedCount -= 1

        except ImgurLoginError:
            print(
                "Imgur login failed. \nQuitting the program "\
                "as unexpected errors might occur."
            )
            sys.exit()

        except ImgurLimitError as exception:
            # Record the failure; this block was previously duplicated for
            # three different exception types.
            FAILED_FILE.add({i+1: [_formatException(exception), submissions[i]]})
            downloadedCount -= 1

        except NotADownloadableLinkError as exception:
            print(_formatException(exception))
            FAILED_FILE.add({i+1: [_formatException(exception), submissions[i]]})
            downloadedCount -= 1

        except NoSuitablePost:
            print("No match found, skipping...")
            downloadedCount -= 1

        except Exception as exception:
            # Catch-all boundary: log the failure and keep downloading the
            # remaining posts rather than aborting the whole run.
            print(_formatException(exception))
            FAILED_FILE.add({i+1: [_formatException(exception), submissions[i]]})
            downloadedCount -= 1

    if duplicates:
        print("\n There was {} duplicates".format(duplicates))

    if downloadedCount == 0:
        print(" Nothing downloaded :(")
    else:
        print(" Total of {} links downloaded!".format(downloadedCount))
|
||||
|
||||
def main():
    """Program entry point: parse arguments, resolve the download directory
    and configuration, then fetch and download the requested posts.
    """
    GLOBAL.arguments = parseArguments()

    # Prompt for a directory only when it was not given on the command line.
    if GLOBAL.arguments.directory is not None:
        GLOBAL.directory = Path(GLOBAL.arguments.directory)
    else:
        GLOBAL.directory = Path(input("download directory: "))

    # Record the exact command-line invocation in the log file only.
    print("\n"," ".join(sys.argv),"\n",noPrint=True)

    try:
        checkConflicts()
    except ProgramModeError:
        # Conflicting/missing mode arguments: fall back to interactive prompts.
        # (The previous `as err` binding was never used.)
        PromptUser()

    if not Path(GLOBAL.configDirectory).is_dir():
        os.makedirs(GLOBAL.configDirectory)
    # A config.json in the working directory takes precedence over the one
    # in the user's config directory.
    GLOBAL.config = getConfig("config.json") if Path("config.json").exists() \
                    else getConfig(GLOBAL.configDirectory / "config.json")

    # --log replays a previous run's log file and skips post discovery.
    if GLOBAL.arguments.log is not None:
        logDir = Path(GLOBAL.arguments.log)
        download(postFromLog(logDir))
        sys.exit()

    try:
        POSTS = getPosts(prepareAttributes())
    except Exception as exc:
        # Dump the full traceback to the log file, show only the short
        # message on the console, then quit.
        logging.error(sys.exc_info()[0].__name__,
                      exc_info=full_exc_info(sys.exc_info()))
        print(log_stream.getvalue(),noPrint=True)
        print(exc)
        sys.exit()

    if POSTS is None:
        print("I could not find any posts in that URL")
        sys.exit()

    # --no-download mode stops here (the redundant exit/else pair is gone).
    if not GLOBAL.arguments.NoDownload:
        download(POSTS)
|
||||
|
||||
if __name__ == "__main__":

    # Collect log records in an in-memory buffer so main() can dump them to
    # the log file (via print(..., noPrint=True)) on failure.
    log_stream = StringIO()
    logging.basicConfig(stream=log_stream, level=logging.INFO)

    try:
        # Deliberately shadow the builtin print with printToFile so every
        # message is also written to the log file; the original builtin is
        # kept as VanillaPrint.
        VanillaPrint = print
        print = printToFile
        GLOBAL.RUN_TIME = time.time()
        main()

    except KeyboardInterrupt:
        # Ensure a directory exists so the final log writes don't fail.
        if GLOBAL.directory is None:
            GLOBAL.directory = Path(".\\")

    except Exception as exception:
        # Top-level boundary: log the full traceback and show it to the user.
        if GLOBAL.directory is None:
            GLOBAL.directory = Path(".\\")
        logging.error(sys.exc_info()[0].__name__,
                      exc_info=full_exc_info(sys.exc_info()))
        print(log_stream.getvalue())

    # Keep the console window open unless --quit was passed.
    if not GLOBAL.arguments.quit: input("\nPress enter to quit\n")
|
90
scripts/README.md
Normal file
90
scripts/README.md
Normal file
|
@ -0,0 +1,90 @@
|
|||
# Useful Scripts
|
||||
|
||||
Due to the verboseness of the logs, a great deal of information can be gathered quite easily from the BDFR's logfiles. In this folder, there is a selection of scripts that parse these logs, scraping useful bits of information. Since the logfiles are recurring patterns of strings, it is a fairly simple matter to write scripts that utilise tools included on most Linux systems.
|
||||
|
||||
- [Script to extract all successfully downloaded IDs](#extract-all-successfully-downloaded-ids)
|
||||
- [Script to extract all failed download IDs](#extract-all-failed-ids)
|
||||
- [Timestamp conversion](#converting-bdfrv1-timestamps-to-bdfrv2-timestamps)
|
||||
- [Printing summary statistics for a run](#printing-summary-statistics)
|
||||
- [Unsaving posts from your account after downloading](#unsave-posts-after-downloading)
|
||||
|
||||
## Extract all Successfully Downloaded IDs
|
||||
|
||||
This script is contained [here](extract_successful_ids.sh) and will result in a file that contains the IDs of everything that was successfully downloaded without an error. That is, a list will be created of submissions that, with the `--exclude-id-file` option, can be used so that the BDFR will not attempt to redownload these submissions/comments. This is likely to cause a performance increase, especially when the BDFR run finds many resources.
|
||||
|
||||
The script can be used with the following signature:
|
||||
|
||||
```bash
|
||||
./extract_successful_ids.sh LOGFILE_LOCATION <OUTPUT_FILE>
|
||||
```
|
||||
|
||||
By default, if the second argument is not supplied, the script will write the results to `successful.txt`.
|
||||
|
||||
An example of the script being run on a Linux machine is the following:
|
||||
|
||||
```bash
|
||||
./extract_successful_ids.sh ~/.config/bdfr/log_output.txt
|
||||
```
|
||||
|
||||
## Extract all Failed IDs
|
||||
|
||||
[This script](extract_failed_ids.sh) will output a file of all IDs that failed to be downloaded from the logfile in question. This may be used to prevent subsequent runs of the BDFR from re-attempting those submissions if that is desired, potentially increasing performance.
|
||||
The script can be used with the following signature:
|
||||
|
||||
```bash
|
||||
./extract_failed_ids.sh LOGFILE_LOCATION <OUTPUT_FILE>
|
||||
```
|
||||
|
||||
By default, if the second argument is not supplied, the script will write the results to `failed.txt`.
|
||||
|
||||
An example of the script being run on a Linux machine is the following:
|
||||
|
||||
```bash
|
||||
./extract_failed_ids.sh ~/.config/bdfr/log_output.txt
|
||||
```
|
||||
|
||||
## Converting BDFRv1 Timestamps to BDFRv2 Timestamps
|
||||
|
||||
BDFRv2 uses an internationally recognised and standardised format for timestamps, namely ISO 8601. This is highly recommended due to the nature of using such a widespread and understood standard. However, the BDFRv1 does not use this standard. Due to this, if you've used the old timestamp in filenames or folders, the BDFR will no longer recognise them as the same file and potentially redownload duplicate resources.
|
||||
|
||||
To prevent this, it is recommended that you rename existing files to ISO 8601 standard. This can be done using the [timestamp-converter](https://github.com/Serene-Arc/timestamp-converter) tool made for this purpose. Instructions specifically for the BDFR are available in that project.
|
||||
|
||||
## Printing Summary Statistics
|
||||
|
||||
A simple script has been included to print summary statistics for a run of the BDFR. This is mainly to showcase how easy it is to extract statistics from the logfiles. You can extend this quite easily. For example, you can print how often the Imgur module is used, or how many 404 errors there are in the last run, or which module has caused the most errors. The possibilities really are endless.
|
||||
|
||||
```bash
|
||||
./print_summary.sh LOGFILE_LOCATION
|
||||
```
|
||||
|
||||
This will create an output like the following:
|
||||
|
||||
```text
|
||||
Downloaded submissions: 250
|
||||
Failed downloads: 103
|
||||
Files already downloaded: 20073
|
||||
Hard linked submissions: 30
|
||||
Excluded submissions: 1146
|
||||
Files with existing hash skipped: 0
|
||||
Submissions from excluded subreddits: 0
|
||||
```
|
||||
|
||||
## Unsave Posts After Downloading
|
||||
|
||||
[This script](unsaveposts.py) takes a list of submission IDs from a file named `successfulids` created with the `extract_successful_ids.sh` script and unsaves them from your account. To make it work you will need to make a user script in your reddit profile like this:
|
||||
- Fill in the username and password fields in the script. Make sure you keep the quotes around the fields.
|
||||
- Go to https://old.reddit.com/prefs/apps/
|
||||
- Click on `Develop an app` at the bottom.
|
||||
- Make sure you select a `script` not a `web app`.
|
||||
- Name it `Unsave Posts`.
|
||||
- Fill in the `Redirect URI` field with `127.0.0.0`.
|
||||
- Save it.
|
||||
- Fill in the `client_id` and `client_secret` fields in the script. The client ID is the 14 character string under the name you gave your script. It'll look like a bunch of random characters like this: pspYLwDoci9z_A. The client secret is the longer string next to "secret". Again keep the quotes around the fields.
|
||||
|
||||
Now the script is ready to run. Just execute it like this:
|
||||
|
||||
```bash
|
||||
python3.9 -m bdfr download DOWNLOAD_DIR --authenticate --user me --saved --log LOGFILE_LOCATION
|
||||
./extract_successful_ids.sh LOGFILE_LOCATION > successfulids
|
||||
./unsaveposts.py
|
||||
```
|
21
scripts/extract_failed_ids.ps1
Normal file
21
scripts/extract_failed_ids.ps1
Normal file
|
@ -0,0 +1,21 @@
|
|||
# Extract the IDs of failed downloads from a BDFR logfile.
# Usage: .\extract_failed_ids.ps1 LOGFILE [OUTPUT_FILE]
# IDs are appended to OUTPUT_FILE (default ./failed.txt).
if (Test-Path -Path $args[0] -PathType Leaf) {
    $file=$args[0]
}
else {
    Write-Host "CANNOT FIND LOG FILE"
    Exit 1
}

if ($null -ne $args[1]) {
    $output=$args[1]
    Write-Host "Outputting IDs to $output"
}
else {
    $output="./failed.txt"
}

# Each pattern matches one failure message in the log. -split tokenizes the
# line on whitespace; Skip/First select the token holding the submission ID,
# and .substring(0, len-1) trims a trailing character (e.g. a colon) where
# the ID ends a clause.
Select-String -Path $file -Pattern "Could not download submission" | ForEach-Object { -split $_.Line | Select-Object -Skip 11 | Select-Object -First 1 } | ForEach-Object { $_.substring(0,$_.Length-1) } >> $output
Select-String -Path $file -Pattern "Failed to download resource" | ForEach-Object { -split $_.Line | Select-Object -Skip 14 | Select-Object -First 1 } >> $output
Select-String -Path $file -Pattern "failed to download submission" | ForEach-Object { -split $_.Line | Select-Object -Skip 13 | Select-Object -First 1 } | ForEach-Object { $_.substring(0,$_.Length-1) } >> $output
Select-String -Path $file -Pattern "Failed to write file" | ForEach-Object { -split $_.Line | Select-Object -Skip 13 | Select-Object -First 1 } >> $output
Select-String -Path $file -Pattern "skipped due to disabled module" | ForEach-Object { -split $_.Line | Select-Object -Skip 8 | Select-Object -First 1 } >> $output
|
16
scripts/extract_failed_ids.sh
Executable file
16
scripts/extract_failed_ids.sh
Executable file
|
@ -0,0 +1,16 @@
|
|||
#!/bin/bash
# Extract the IDs of failed downloads from a BDFR logfile.
# Usage: ./extract_failed_ids.sh LOGFILE
# NOTE(review): output goes to stdout; the README's optional second-argument
# output file is not implemented in this shell version — confirm intent.

if [ -e "$1" ]; then
    file="$1"
else
    echo 'CANNOT FIND LOG FILE'
    exit 1
fi

# Each grep matches one failure message; awk prints the whitespace-delimited
# field holding the submission ID, and `rev | cut -c 2- | rev` strips one
# trailing character (e.g. a colon) where the ID ends a clause.
{
    grep 'Could not download submission' "$file" | awk '{ print $12 }' | rev | cut -c 2- | rev ;
    grep 'Failed to download resource' "$file" | awk '{ print $15 }' ;
    grep 'failed to download submission' "$file" | awk '{ print $14 }' | rev | cut -c 2- | rev ;
    grep 'Failed to write file' "$file" | awk '{ print $14 }' ;
    grep 'skipped due to disabled module' "$file" | awk '{ print $9 }' ;
}
|
21
scripts/extract_successful_ids.ps1
Normal file
21
scripts/extract_successful_ids.ps1
Normal file
|
@ -0,0 +1,21 @@
|
|||
# Extract the IDs of successfully downloaded submissions from a BDFR logfile.
# Usage: .\extract_successful_ids.ps1 LOGFILE [OUTPUT_FILE]
# IDs are appended to OUTPUT_FILE (default ./successful.txt).
if (Test-Path -Path $args[0] -PathType Leaf) {
    $file=$args[0]
}
else {
    Write-Host "CANNOT FIND LOG FILE"
    Exit 1
}

if ($null -ne $args[1]) {
    $output=$args[1]
    Write-Host "Outputting IDs to $output"
}
else {
    $output="./successful.txt"
}

# Each pattern matches one success message; -split tokenizes the line, and
# the Last/SkipLast counts select the token holding the submission ID
# counted from the end of the line.
Select-String -Path $file -Pattern "Downloaded submission" | ForEach-Object { -split $_.Line | Select-Object -Last 3 | Select-Object -SkipLast 2 } >> $output
Select-String -Path $file -Pattern "Resource hash" | ForEach-Object { -split $_.Line | Select-Object -Last 3 | Select-Object -SkipLast 2 } >> $output
Select-String -Path $file -Pattern "Download filter" | ForEach-Object { -split $_.Line | Select-Object -Last 4 | Select-Object -SkipLast 3 } >> $output
Select-String -Path $file -Pattern "already exists, continuing" | ForEach-Object { -split $_.Line | Select-Object -Last 4 | Select-Object -SkipLast 3 } >> $output
Select-String -Path $file -Pattern "Hard link made" | ForEach-Object { -split $_.Line | Select-Object -Last 1 } >> $output
|
17
scripts/extract_successful_ids.sh
Executable file
17
scripts/extract_successful_ids.sh
Executable file
|
@ -0,0 +1,17 @@
|
|||
#!/bin/bash
# Extract the IDs of successfully downloaded submissions from a BDFR logfile.
# Usage: ./extract_successful_ids.sh LOGFILE
# NOTE(review): output goes to stdout; the README's optional second-argument
# output file is not implemented in this shell version — confirm intent.

if [ -e "$1" ]; then
    file="$1"
else
    echo 'CANNOT FIND LOG FILE'
    exit 1
fi

# Each grep matches one success message; awk prints the field holding the
# submission ID, counted from the end of the line via NF where the message
# tail has a fixed shape.
{
    grep 'Downloaded submission' "$file" | awk '{ print $(NF-2) }' ;
    grep 'Resource hash' "$file" | awk '{ print $(NF-2) }' ;
    grep 'Download filter' "$file" | awk '{ print $(NF-3) }' ;
    grep 'already exists, continuing' "$file" | awk '{ print $(NF-3) }' ;
    grep 'Hard link made' "$file" | awk '{ print $(NF) }' ;
    grep 'filtered due to score' "$file" | awk '{ print $9 }'
}
|
30
scripts/print_summary.ps1
Normal file
30
scripts/print_summary.ps1
Normal file
|
@ -0,0 +1,30 @@
|
|||
# Print summary statistics (counts per outcome) for a BDFR run logfile.
# Usage: .\print_summary.ps1 LOGFILE
if (Test-Path -Path $args[0] -PathType Leaf) {
    $file=$args[0]
}
else {
    Write-Host "CANNOT FIND LOG FILE"
    Exit 1
}

# NOTE(review): $output is assigned below but never used — the summary is
# always written to the console. Confirm whether file output was intended
# or whether this block was copied from the extract_* scripts.
if ($null -ne $args[1]) {
    $output=$args[1]
    Write-Host "Outputting IDs to $output"
}
else {
    $output="./successful.txt"
}

# Each count is the number of log lines matching one outcome message.
Write-Host -NoNewline "Downloaded submissions: "
Write-Host (Select-String -Path $file -Pattern "Downloaded submission" -AllMatches).Matches.Count
Write-Host -NoNewline "Failed downloads: "
Write-Host (Select-String -Path $file -Pattern "failed to download submission" -AllMatches).Matches.Count
Write-Host -NoNewline "Files already downloaded: "
Write-Host (Select-String -Path $file -Pattern "already exists, continuing" -AllMatches).Matches.Count
Write-Host -NoNewline "Hard linked submissions: "
Write-Host (Select-String -Path $file -Pattern "Hard link made" -AllMatches).Matches.Count
Write-Host -NoNewline "Excluded submissions: "
Write-Host (Select-String -Path $file -Pattern "in exclusion list" -AllMatches).Matches.Count
Write-Host -NoNewline "Files with existing hash skipped: "
Write-Host (Select-String -Path $file -Pattern "downloaded elsewhere" -AllMatches).Matches.Count
Write-Host -NoNewline "Submissions from excluded subreddits: "
Write-Host (Select-String -Path $file -Pattern "in skip list" -AllMatches).Matches.Count
|
16
scripts/print_summary.sh
Executable file
16
scripts/print_summary.sh
Executable file
|
@ -0,0 +1,16 @@
|
|||
#!/bin/bash
# Print summary statistics (counts per outcome) for a BDFR run logfile.
# Usage: ./print_summary.sh LOGFILE

if [ -e "$1" ]; then
    file="$1"
else
    echo 'CANNOT FIND LOG FILE'
    exit 1
fi

# Each count is the number of log lines matching one outcome message.
echo "Downloaded submissions: $( grep -c 'Downloaded submission' "$file" )"
echo "Failed downloads: $( grep -c 'failed to download submission' "$file" )"
echo "Files already downloaded: $( grep -c 'already exists, continuing' "$file" )"
echo "Hard linked submissions: $( grep -c 'Hard link made' "$file" )"
echo "Excluded submissions: $( grep -c 'in exclusion list' "$file" )"
echo "Files with existing hash skipped: $( grep -c 'downloaded elsewhere' "$file" )"
echo "Submissions from excluded subreddits: $( grep -c 'in skip list' "$file" )"
|
13
scripts/tests/README.md
Normal file
13
scripts/tests/README.md
Normal file
|
@ -0,0 +1,13 @@
|
|||
# Bash Scripts Testing
|
||||
|
||||
The `bats` framework is included and used to test the scripts included, specifically the scripts designed to parse through the logging output. As this involves delicate regex and indexes, it is necessary to test these.
|
||||
|
||||
## Running Tests
|
||||
|
||||
Running the tests is easy, and can be done with a single command. Once the working directory is this directory, run the following command.
|
||||
|
||||
```bash
|
||||
./bats/bin/bats *.bats
|
||||
```
|
||||
|
||||
This will run all test files that have the `.bats` suffix.
|
1
scripts/tests/bats
Submodule
1
scripts/tests/bats
Submodule
|
@ -0,0 +1 @@
|
|||
Subproject commit ce5ca2802fabe5dc38393240cd40e20f8928d3b0
|
|
@ -0,0 +1 @@
|
|||
[2021-06-12 12:49:18,452 - bdfr.downloader - DEBUG] - Submission m2601g skipped due to disabled module Direct
|
3
scripts/tests/example_logfiles/failed_no_downloader.txt
Normal file
3
scripts/tests/example_logfiles/failed_no_downloader.txt
Normal file
|
@ -0,0 +1,3 @@
|
|||
[2021-06-12 11:13:35,665 - bdfr.downloader - ERROR] - Could not download submission nxv3ew: No downloader module exists for url https://www.biorxiv.org/content/10.1101/2021.06.11.447961v1?rss=1
|
||||
[2021-06-12 11:14:21,958 - bdfr.downloader - ERROR] - Could not download submission nxv3ek: No downloader module exists for url https://alkossegyedit.hu/termek/pluss-macko-poloval-20cm/?feed_id=34832&_unique_id=60c40a1190ccb&utm_source=Reddit&utm_medium=AEAdmin&utm_campaign=Poster
|
||||
[2021-06-12 11:17:53,456 - bdfr.downloader - ERROR] - Could not download submission nxv3ea: No downloader module exists for url https://www.biorxiv.org/content/10.1101/2021.06.11.448067v1?rss=1
|
1
scripts/tests/example_logfiles/failed_resource_error.txt
Normal file
1
scripts/tests/example_logfiles/failed_resource_error.txt
Normal file
|
@ -0,0 +1 @@
|
|||
[2021-06-12 11:18:25,794 - bdfr.downloader - ERROR] - Failed to download resource https://i.redd.it/61fniokpjq471.jpg in submission nxv3dt with downloader Direct: Unrecoverable error requesting resource: HTTP Code 404
|
|
@ -0,0 +1,2 @@
|
|||
[2021-06-12 08:38:35,657 - bdfr.downloader - ERROR] - Site Gallery failed to download submission nxr7x9: No images found in Reddit gallery
|
||||
[2021-06-12 08:47:22,005 - bdfr.downloader - ERROR] - Site Gallery failed to download submission nxpn0h: Server responded with 503 to https://www.reddit.com/gallery/nxpkvh
|
1
scripts/tests/example_logfiles/failed_write_error.txt
Normal file
1
scripts/tests/example_logfiles/failed_write_error.txt
Normal file
|
@ -0,0 +1 @@
|
|||
[2021-06-09 22:01:04,530 - bdfr.downloader - ERROR] - Failed to write file in submission nnboza to C:\Users\Yoga 14\path\to\output\ThotNetwork\KatieCarmine_I POST A NEW VIDEO ALMOST EVERYDAY AND YOU NEVER HAVE TO PAY EXTRA FOR IT! I want to share my sex life with you! Only $6 per month and you get full access to over 400 videos of me getting fuck_nnboza.mp4: [Errno 2] No such file or directory: 'C:\\Users\\Yoga 14\\path\\to\\output\\ThotNetwork\\KatieCarmine_I POST A NEW VIDEO ALMOST EVERYDAY AND YOU NEVER HAVE TO PAY EXTRA FOR IT! I want to share my sex life with you! Only $6 per month and you get full access to over 400 videos of me getting fuck_nnboza.mp4'
|
|
@ -0,0 +1,3 @@
|
|||
[2021-06-12 08:41:51,464 - bdfr.downloader - DEBUG] - File /media/smaug/private/reddit/tumblr/nxry0l.jpg from submission nxry0l already exists, continuing
|
||||
[2021-06-12 08:41:51,469 - bdfr.downloader - DEBUG] - File /media/smaug/private/reddit/tumblr/nxrlgn.gif from submission nxrlgn already exists, continuing
|
||||
[2021-06-12 08:41:51,472 - bdfr.downloader - DEBUG] - File /media/smaug/private/reddit/tumblr/nxrq9g.png from submission nxrq9g already exists, continuing
|
|
@ -0,0 +1,3 @@
|
|||
[2021-06-10 20:36:48,722 - bdfr.downloader - DEBUG] - Download filter removed nwfirr with URL https://www.youtube.com/watch?v=NVSiX0Tsees
|
||||
[2021-06-12 19:56:36,848 - bdfr.downloader - DEBUG] - Download filter removed nwfgcl with URL https://www.reddit.com/r/MaliciousCompliance/comments/nwfgcl/new_guy_decided_to_play_manager_alright/
|
||||
[2021-06-12 19:56:28,587 - bdfr.downloader - DEBUG] - Download filter removed nxuxjy with URL https://www.reddit.com/r/MaliciousCompliance/comments/nxuxjy/you_want_an_omelette_with_nothing_inside_okay/
|
|
@ -0,0 +1,7 @@
|
|||
[2021-06-12 11:58:53,864 - bdfr.downloader - INFO] - Downloaded submission nxui9y from tumblr
|
||||
[2021-06-12 11:58:56,618 - bdfr.downloader - INFO] - Downloaded submission nxsr4r from tumblr
|
||||
[2021-06-12 11:58:59,026 - bdfr.downloader - INFO] - Downloaded submission nxviir from tumblr
|
||||
[2021-06-12 11:59:00,289 - bdfr.downloader - INFO] - Downloaded submission nxusva from tumblr
|
||||
[2021-06-12 11:59:00,735 - bdfr.downloader - INFO] - Downloaded submission nxvko7 from tumblr
|
||||
[2021-06-12 11:59:01,215 - bdfr.downloader - INFO] - Downloaded submission nxvd63 from tumblr
|
||||
[2021-06-12 11:59:13,891 - bdfr.downloader - INFO] - Downloaded submission nn9cor from tumblr
|
1
scripts/tests/example_logfiles/succeed_hard_link.txt
Normal file
1
scripts/tests/example_logfiles/succeed_hard_link.txt
Normal file
|
@ -0,0 +1 @@
|
|||
[2021-06-11 17:33:02,118 - bdfr.downloader - INFO] - Hard link made linking /media/smaug/private/reddit/tumblr/nwnp2n.jpg to /media/smaug/private/reddit/tumblr/nwskqb.jpg in submission nwnp2n
|
1
scripts/tests/example_logfiles/succeed_resource_hash.txt
Normal file
1
scripts/tests/example_logfiles/succeed_resource_hash.txt
Normal file
|
@ -0,0 +1 @@
|
|||
[2021-06-11 17:33:02,118 - bdfr.downloader - INFO] - Resource hash aaaaaaaaaaaaaaaaaaaaaaa from submission n86jk8 downloaded elsewhere
|
2
scripts/tests/example_logfiles/succeed_score_filter.txt
Normal file
2
scripts/tests/example_logfiles/succeed_score_filter.txt
Normal file
|
@ -0,0 +1,2 @@
|
|||
[2022-07-23 14:04:14,095 - bdfr.downloader - DEBUG] - Submission ljyy27 filtered due to score 15 < [50]
|
||||
[2022-07-23 14:04:14,104 - bdfr.downloader - DEBUG] - Submission ljyy27 filtered due to score 16 > [1]
|
48
scripts/tests/test_extract_failed_ids.bats
Normal file
48
scripts/tests/test_extract_failed_ids.bats
Normal file
|
@ -0,0 +1,48 @@
|
|||
# Bats tests for extract_failed_ids.sh. Each case runs the script against an
# example logfile, then checks (a) the number of extracted IDs and (b) that
# no output line fails to look like a 6-7 character reddit ID.

setup() {
    load ./test_helper/bats-support/load
    load ./test_helper/bats-assert/load
}

teardown() {
    # Remove the scratch file each case writes its captured output into.
    rm -f failed.txt
}

@test "fail run no logfile" {
    run ../extract_failed_ids.sh
    assert_failure
}

@test "fail no downloader module" {
    run ../extract_failed_ids.sh ./example_logfiles/failed_no_downloader.txt
    echo "$output" > failed.txt
    assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "3" ];
    assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ];
}

@test "fail resource error" {
    run ../extract_failed_ids.sh ./example_logfiles/failed_resource_error.txt
    echo "$output" > failed.txt
    assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ];
    assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ];
}

@test "fail site downloader error" {
    run ../extract_failed_ids.sh ./example_logfiles/failed_sitedownloader_error.txt
    echo "$output" > failed.txt
    assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "2" ];
    assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ];
}

@test "fail failed file write" {
    run ../extract_failed_ids.sh ./example_logfiles/failed_write_error.txt
    echo "$output" > failed.txt
    assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ];
    assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ];
}

@test "fail disabled module" {
    run ../extract_failed_ids.sh ./example_logfiles/failed_disabled_module.txt
    echo "$output" > failed.txt
    assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ];
    assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ];
}
|
50
scripts/tests/test_extract_successful_ids.bats
Normal file
50
scripts/tests/test_extract_successful_ids.bats
Normal file
|
@ -0,0 +1,50 @@
|
|||
# Bats tests for extract_successful_ids.sh. Each case runs the script
# against an example logfile, then checks (a) the number of extracted IDs
# and (b) that no output line fails to look like a 6-7 character reddit ID.

setup() {
    load ./test_helper/bats-support/load
    load ./test_helper/bats-assert/load
}

teardown() {
    # Remove the scratch file each case writes its captured output into.
    rm -f successful.txt
}

@test "success downloaded submission" {
    run ../extract_successful_ids.sh ./example_logfiles/succeed_downloaded_submission.txt
    echo "$output" > successful.txt
    assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "7" ];
    assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ];
}

@test "success resource hash" {
    run ../extract_successful_ids.sh ./example_logfiles/succeed_resource_hash.txt
    echo "$output" > successful.txt
    assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "1" ];
    assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ];
}

@test "success download filter" {
    run ../extract_successful_ids.sh ./example_logfiles/succeed_download_filter.txt
    echo "$output" > successful.txt
    assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "3" ];
    assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ];
}

@test "success already exists" {
    run ../extract_successful_ids.sh ./example_logfiles/succeed_already_exists.txt
    echo "$output" > successful.txt
    assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "3" ];
    assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ];
}

@test "success hard link" {
    run ../extract_successful_ids.sh ./example_logfiles/succeed_hard_link.txt
    echo "$output" > successful.txt
    assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "1" ];
    assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ];
}

@test "success score filter" {
    run ../extract_successful_ids.sh ./example_logfiles/succeed_score_filter.txt
    echo "$output" > successful.txt
    assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "2" ];
    assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ];
}
|
1
scripts/tests/test_helper/bats-assert
Submodule
1
scripts/tests/test_helper/bats-assert
Submodule
|
@ -0,0 +1 @@
|
|||
Subproject commit e0de84e9c011223e7f88b7ccf1c929f4327097ba
|
1
scripts/tests/test_helper/bats-support
Submodule
1
scripts/tests/test_helper/bats-support
Submodule
|
@ -0,0 +1 @@
|
|||
Subproject commit d140a65044b2d6810381935ae7f0c94c7023c8c3
|
40
scripts/unsaveposts.py
Normal file
40
scripts/unsaveposts.py
Normal file
|
@ -0,0 +1,40 @@
|
|||
#! /usr/bin/env python3.9
'''
This script takes a list of submission IDs from a file named "successfulids" created with the
"extract_successful_ids.sh" script and unsaves them from your account. To make it work you must
fill in the username and password fields below. Make sure you keep the quotes around the fields.
You'll need to make a "user script" in your reddit profile to run this.
Go to https://old.reddit.com/prefs/apps/
Click on "Develop an app" at the bottom.
Make sure you select a "script" not a "web app."
Give it a random name. Doesn't matter.
You need to fill in the "Redirect URI" field with something so go ahead and put 127.0.0.0 in there.
Save it.
The client ID is the 14 character string under the name you gave your script.
It'll look like a bunch of random characters like this: pspYLwDoci9z_A
The client secret is the longer string next to "secret".
Replace those two fields below. Again keep the quotes around the fields.
'''

import praw

try:
    reddit = praw.Reddit(
        client_id="CLIENTID",
        client_secret="CLIENTSECRET",
        password="USERPASSWORD",
        user_agent="Unsave Posts",
        username="USERNAME",
    )

    # One submission ID per line; unsave each from the logged-in account.
    with open("successfulids", "r") as id_file:
        for line in id_file:
            reddit.submission(id=line.strip()).unsave()

# The original bare `except:` also swallowed SystemExit and
# KeyboardInterrupt; catching Exception keeps Ctrl-C working.
except Exception:
    print("Something went wrong. Did you install PRAW? Did you change the user login fields?")

else:
    print("Done! Thanks for playing!")
|
||||
|
50
setup.py
50
setup.py
|
@ -1,50 +0,0 @@
|
|||
#!C:\Users\Ali\AppData\Local\Programs\Python\Python36\python.exe

## python setup.py build
import sys
from cx_Freeze import setup, Executable
from script import __version__

# Packages cx_Freeze must bundle into the frozen executable.
options = {
    "build_exe": {
        "packages":[
            "idna","imgurpython", "praw", "requests"
        ]
    }
}

if sys.platform == "win32":
    executables = [Executable(
        "script.py",
        targetName="bulk-downloader-for-reddit.exe",
        shortcutName="Bulk Downloader for Reddit",
        shortcutDir="DesktopFolder"
    )]

elif sys.platform == "linux":
    executables = [Executable(
        "script.py",
        targetName="bulk-downloader-for-reddit",
        shortcutName="Bulk Downloader for Reddit",
        shortcutDir="DesktopFolder"
    )]

else:
    # Previously any other platform hit a NameError on `executables` at the
    # setup() call below; fail with an explicit message instead.
    raise RuntimeError("Unsupported platform for freezing: " + sys.platform)

setup(
    name = "Bulk Downloader for Reddit",
    version = __version__,
    description = "Bulk Downloader for Reddit",
    author = "Ali Parlakci",
    author_email="parlakciali@gmail.com",
    url="https://github.com/aliparlakci/bulk-downloader-for-reddit",
    classifiers=[
        "Programming Language :: Python :: 3",
        # A missing comma here used to concatenate this classifier with the
        # next one into a single invalid string.
        "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
        "Natural Language :: English",
        "Environment :: Console",
        "Operating System :: OS Independent",
    ],
    executables = executables,
    options = options
)
|
||||
|
||||
|
|
@ -1,537 +0,0 @@
|
|||
import io
|
||||
import os
|
||||
import sys
|
||||
import urllib.request
|
||||
from html.parser import HTMLParser
|
||||
from pathlib import Path
|
||||
from urllib.error import HTTPError
|
||||
|
||||
import imgurpython
|
||||
from multiprocessing import Queue
|
||||
|
||||
from src.errors import (AlbumNotDownloadedCompletely, FileAlreadyExistsError,
|
||||
FileNameTooLong, ImgurLoginError,
|
||||
NotADownloadableLinkError)
|
||||
from src.tools import GLOBAL, nameCorrector, printToFile
|
||||
|
||||
VanillaPrint = print
|
||||
print = printToFile
|
||||
|
||||
def dlProgress(count, blockSize, totalSize):
|
||||
"""Function for writing download progress to console
|
||||
"""
|
||||
|
||||
downloadedMbs = int(count*blockSize*(10**(-6)))
|
||||
fileSize = int(totalSize*(10**(-6)))
|
||||
sys.stdout.write("{}Mb/{}Mb\r".format(downloadedMbs,fileSize))
|
||||
sys.stdout.flush()
|
||||
|
||||
def getExtension(link):
    """Return the file extension found in *link*, including the dot.

    When no known media extension appears anywhere in the dotted parts
    of the URL, fall back to '.jpg' — except for v.redd.it links, which
    host videos and default to '.mp4'.
    """

    pieces = link.split('.')

    if any(kind in pieces for kind in ('jpg', 'png', 'mp4', 'webm', 'gif')):
        return "." + pieces[-1]

    # No recognised extension in the URL.
    return '.mp4' if "v.redd.it" in link else '.jpg'
|
||||
|
||||
def getFile(fileDir, tempDir, imageURL, indent=0):
    """Download *imageURL*, writing to *tempDir* and renaming to *fileDir*.

    The transfer goes to the '.tmp' path first and is renamed only after
    it completes, so an interrupted download never leaves a half-written
    file under the final name.

    fileDir  -- full path of the final file
    tempDir  -- full path of the temporary '.tmp' file
    imageURL -- URL of the file to download
    indent   -- number of spaces prefixed to console messages

    Raises FileAlreadyExistsError when fileDir already exists,
    FileNameTooLong when the OS rejects the file name, and re-raises the
    last ConnectionResetError when all retries are exhausted.
    """

    headers = [
        ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
         "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 "\
         "Safari/537.36 OPR/54.0.2952.64"),
        ("Accept", "text/html,application/xhtml+xml,application/xml;" \
         "q=0.9,image/webp,image/apng,*/*;q=0.8"),
        ("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.3"),
        ("Accept-Encoding", "none"),
        ("Accept-Language", "en-US,en;q=0.8"),
        ("Connection", "keep-alive")
    ]

    opener = urllib.request.build_opener()
    opener.addheaders = headers
    # Imgur URLs are fetched without the browser-style opener — presumably
    # imgur rejects or rate-limits these headers; TODO confirm intent.
    if "imgur" not in imageURL:
        urllib.request.install_opener(opener)

    if os.path.isfile(fileDir):
        raise FileAlreadyExistsError

    lastException = None
    for _ in range(3):
        try:
            urllib.request.urlretrieve(imageURL,
                                       tempDir,
                                       reporthook=dlProgress)
            os.rename(tempDir, fileDir)
        except ConnectionResetError as exception:
            lastException = exception
            print(" "*indent + str(exception))
            print(" "*indent + "Trying again\n")
        except FileNotFoundError:
            # The OS refused the path; most likely the generated file
            # name is too long for the filesystem.
            raise FileNameTooLong
        else:
            print(" "*indent + "Downloaded" + " "*10)
            break
    else:
        # BUG FIX: the function previously returned silently after all
        # three attempts failed, so callers believed the download
        # succeeded. Surface the failure instead.
        raise lastException
|
||||
|
||||
class Erome:
    """Download erome.com posts by scraping media links from the page."""

    def __init__(self,directory,post):
        # Resolve all media links on the album page first; an HTTP error
        # means the post URL is not something we can download.
        try:
            IMAGES = self.getLinks(post['postURL'])
        except urllib.error.HTTPError:
            raise NotADownloadableLinkError("Not a downloadable link")

        imagesLenght = len(IMAGES)
        howManyDownloaded = imagesLenght
        duplicates = 0

        if imagesLenght == 1:

            extension = getExtension(IMAGES[0])

            """Filenames are declared here"""

            title = nameCorrector(post['postTitle'])
            print(post["postSubmitter"]+"_"+title+"_"+post['postId']+extension)

            fileDir = directory / (
                post["postSubmitter"]+"_"+title+"_"+post['postId']+extension
            )
            tempDir = directory / (
                post["postSubmitter"]+"_"+title+"_"+post['postId']+".tmp"
            )

            # Links scraped from the page are protocol-relative ("//...").
            imageURL = "https:" + IMAGES[0]

            try:
                getFile(fileDir,tempDir,imageURL)
            except FileNameTooLong:
                # Full name rejected by the OS: retry with just the id.
                fileDir = directory / (post['postId'] + extension)
                tempDir = directory / (post['postId'] + '.tmp')
                getFile(fileDir,tempDir,imageURL)

        else:
            # Multi-file album: download into a folder named after the post.
            title = nameCorrector(post['postTitle'])
            print(post["postSubmitter"]+"_"+title+"_"+post['postId'],end="\n\n")

            folderDir = directory / (
                post["postSubmitter"] + "_" + title + "_" + post['postId']
            )

            try:
                if not os.path.exists(folderDir):
                    os.makedirs(folderDir)
            except FileNotFoundError:
                # Folder name rejected (most likely too long): use the id.
                folderDir = directory / post['postId']
                os.makedirs(folderDir)

            for i in range(imagesLenght):

                extension = getExtension(IMAGES[i])

                # Album files are numbered 1..N rather than titled.
                fileName = str(i+1)
                imageURL = "https:" + IMAGES[i]

                fileDir = folderDir / (fileName + extension)
                tempDir = folderDir / (fileName + ".tmp")

                print("  ({}/{})".format(i+1,imagesLenght))
                print("  {}".format(fileName+extension))

                try:
                    getFile(fileDir,tempDir,imageURL,indent=2)
                    print()
                except FileAlreadyExistsError:
                    print("  The file already exists" + " "*10,end="\n\n")
                    duplicates += 1
                    howManyDownloaded -= 1

                except Exception as exception:
                    # Best-effort: report the failure and keep going with
                    # the remaining album entries.
                    print("\n  Could not get the file")
                    print(
                        "  "
                        + "{class_name}: {info}".format(
                            class_name=exception.__class__.__name__,
                            info=str(exception)
                        )
                        + "\n"
                    )
                    # NOTE(review): exceptionType is written but never read.
                    exceptionType = exception
                    howManyDownloaded -= 1

            # Summarise the album outcome for the caller.
            if duplicates == imagesLenght:
                raise FileAlreadyExistsError
            elif howManyDownloaded + duplicates < imagesLenght:
                raise AlbumNotDownloadedCompletely(
                    "Album Not Downloaded Completely"
                )

    def getLinks(self,url,lineNumber=129):
        """Scrape media links from an erome page's HTML.

        NOTE(review): the lineNumber parameter is dead — it is overwritten
        immediately below; and if the page has no <div id="album">,
        lineNumber stays None and the slice below raises TypeError.
        """

        content = []
        lineNumber = None

        class EromeParser(HTMLParser):
            # Records the most recently opened tag as {name: {attr: value}}.
            tag = None
            def handle_starttag(self, tag, attrs):
                self.tag = {tag:{attr[0]: attr[1] for attr in attrs}}

        pageSource = (urllib.request.urlopen(url).read().decode().split('\n'))

        """ FIND WHERE ALBUM STARTS IN ORDER NOT TO GET WRONG LINKS"""
        for i in range(len(pageSource)):
            obj = EromeParser()
            obj.feed(pageSource[i])
            tag = obj.tag

            if tag is not None:
                if "div" in tag:
                    if "id" in tag["div"]:
                        if tag["div"]["id"] == "album":
                            lineNumber = i
                            break

        # Collect front images and <source> video links after the album div.
        for line in pageSource[lineNumber:]:
            obj = EromeParser()
            obj.feed(line)
            tag = obj.tag
            if tag is not None:
                if "img" in tag:
                    if "class" in tag["img"]:
                        if tag["img"]["class"]=="img-front":
                            content.append(tag["img"]["src"])
                elif "source" in tag:
                    content.append(tag["source"]["src"])

        # Keep images, and only the 480p rendition of each video.
        return [
            link for link in content \
            if link.endswith("_480p.mp4") or not link.endswith(".mp4")
        ]
|
||||
|
||||
class Imgur:
    """Download imgur images and albums through the imgur API."""

    def __init__(self, directory, post):
        self.imgurClient = self.initImgur()

        imgurID = self.getId(post['postURL'])
        content = self.getLink(imgurID)

        if not os.path.exists(directory):
            os.makedirs(directory)

        if content['type'] == 'image':

            # Prefer the mp4 rendition (animated gifs); fall back to the
            # plain link when the object has no mp4 attribute.
            try:
                post['mediaURL'] = content['object'].mp4
            except AttributeError:
                post['mediaURL'] = content['object'].link

            post['postExt'] = getExtension(post['mediaURL'])

            title = nameCorrector(post['postTitle'])

            # Filenames are declared here
            print(post["postSubmitter"]+"_"+title+"_"+post['postId']+post['postExt'])

            fileDir = directory / (
                post["postSubmitter"]
                + "_" + title
                + "_" + post['postId']
                + post['postExt']
            )

            tempDir = directory / (
                post["postSubmitter"]
                + "_" + title
                + "_" + post['postId']
                + ".tmp"
            )

            try:
                getFile(fileDir, tempDir, post['mediaURL'])
            except FileNameTooLong:
                # BUG FIX: these fallbacks were written as
                # `directory / post['postId'] + ext`, which binds as
                # `(Path / id) + str` and raises TypeError. The string
                # concatenation must be parenthesized.
                fileDir = directory / (post['postId'] + post['postExt'])
                tempDir = directory / (post['postId'] + '.tmp')
                getFile(fileDir, tempDir, post['mediaURL'])

        elif content['type'] == 'album':
            images = content['object'].images
            imagesLenght = len(images)
            howManyDownloaded = imagesLenght
            duplicates = 0

            title = nameCorrector(post['postTitle'])
            print(post["postSubmitter"]+"_"+title+"_"+post['postId'],end="\n\n")

            folderDir = directory / (
                post["postSubmitter"] + "_" + title + "_" + post['postId']
            )

            try:
                if not os.path.exists(folderDir):
                    os.makedirs(folderDir)
            except FileNotFoundError:
                # Folder name rejected (most likely too long): use the id.
                folderDir = directory / post['postId']
                os.makedirs(folderDir)

            for i in range(imagesLenght):
                # Prefer the mp4 rendition when the API provides one.
                try:
                    imageURL = images[i]['mp4']
                except KeyError:
                    imageURL = images[i]['link']

                images[i]['Ext'] = getExtension(imageURL)

                fileName = (str(i+1)
                            + "_"
                            + nameCorrector(str(images[i]['title']))
                            + "_"
                            + images[i]['id'])

                # Filenames are declared here
                fileDir = folderDir / (fileName + images[i]['Ext'])
                tempDir = folderDir / (fileName + ".tmp")

                print("  ({}/{})".format(i+1,imagesLenght))
                print("  {}".format(fileName+images[i]['Ext']))

                try:
                    getFile(fileDir, tempDir, imageURL, indent=2)
                    print()
                except FileAlreadyExistsError:
                    print("  The file already exists" + " "*10,end="\n\n")
                    duplicates += 1
                    howManyDownloaded -= 1

                # IF FILE NAME IS TOO LONG, IT WONT REGISTER
                except FileNameTooLong:
                    fileName = (str(i+1) + "_" + images[i]['id'])
                    fileDir = folderDir / (fileName + images[i]['Ext'])
                    tempDir = folderDir / (fileName + ".tmp")
                    try:
                        getFile(fileDir, tempDir, imageURL, indent=2)
                    # IF STILL TOO LONG
                    except FileNameTooLong:
                        fileName = str(i+1)
                        fileDir = folderDir / (fileName + images[i]['Ext'])
                        tempDir = folderDir / (fileName + ".tmp")
                        getFile(fileDir, tempDir, imageURL, indent=2)

                except Exception as exception:
                    # Best-effort: report and continue with the next entry.
                    print("\n  Could not get the file")
                    print(
                        "  "
                        + "{class_name}: {info}".format(
                            class_name=exception.__class__.__name__,
                            info=str(exception)
                        )
                        + "\n"
                    )
                    howManyDownloaded -= 1

            if duplicates == imagesLenght:
                raise FileAlreadyExistsError
            elif howManyDownloaded + duplicates < imagesLenght:
                raise AlbumNotDownloadedCompletely(
                    "Album Not Downloaded Completely"
                )

    @staticmethod
    def initImgur():
        """Initialize and return an imgur API client from GLOBAL.config."""

        config = GLOBAL.config
        return imgurpython.ImgurClient(
            config['imgur_client_id'],
            config['imgur_client_secret']
        )

    def getId(self, submissionURL):
        """Extract the imgur id from *submissionURL* and classify it.

        Returns {'id': ..., 'type': 'album'|'image'}; 'a/' and 'gallery/'
        URLs are albums, everything else is a single image.
        """

        if submissionURL[-1] == "/":
            submissionURL = submissionURL[:-1]

        if "a/" in submissionURL or "gallery/" in submissionURL:
            albumId = submissionURL.split("/")[-1]
            return {'id': albumId, 'type': 'album'}

        # Single image: the id is the path segment right after the domain.
        url = submissionURL.replace('.', '/').split('/')
        imageId = url[url.index('com')+1]
        return {'id': imageId, 'type': 'image'}

    def getLink(self, identity):
        """Fetch the imgur object described by *identity* from the API."""

        if identity['type'] == 'image':
            return {'object': self.imgurClient.get_image(identity['id']),
                    'type': 'image'}
        elif identity['type'] == 'album':
            return {'object': self.imgurClient.get_album(identity['id']),
                    'type': 'album'}

    @staticmethod
    def get_credits():
        """Return the account's remaining imgur API credits."""
        return Imgur.initImgur().get_credits()
|
||||
|
||||
class Gfycat:
    """Download gfycat videos by scraping the direct mp4 link."""

    def __init__(self, directory, POST):
        try:
            POST['mediaURL'] = self.getLink(POST['postURL'])
        except IndexError:
            raise NotADownloadableLinkError("Could not read the page source")
        except Exception as exception:
            # BUG FIX: a leftover debug `raise exception` made the intended
            # error translation below unreachable; unexpected scraping
            # failures are now reported as the error callers handle.
            raise NotADownloadableLinkError(
                "Could not read the page source"
            ) from exception

        POST['postExt'] = getExtension(POST['mediaURL'])

        if not os.path.exists(directory):
            os.makedirs(directory)
        title = nameCorrector(POST['postTitle'])

        # Filenames are declared here
        print(POST["postSubmitter"]+"_"+title+"_"+POST['postId']+POST['postExt'])

        fileDir = directory / (
            POST["postSubmitter"]+"_"+title+"_"+POST['postId']+POST['postExt']
        )
        tempDir = directory / (
            POST["postSubmitter"]+"_"+title+"_"+POST['postId']+".tmp"
        )

        try:
            getFile(fileDir, tempDir, POST['mediaURL'])
        except FileNameTooLong:
            # Full name rejected by the OS: retry with just the id.
            fileDir = directory / (POST['postId']+POST['postExt'])
            tempDir = directory / (POST['postId']+".tmp")
            getFile(fileDir, tempDir, POST['mediaURL'])

    def getLink(self, url, query='<source id="mp4Source" src=', lineNumber=105):
        """Extract the direct video link from a gfycat page's source.

        URLs that already point at a media file are returned unchanged.
        NOTE(review): relies on the mp4 source tag appearing on a fixed
        line (lineNumber) of the page source — fragile against site changes.
        """

        if '.webm' in url or '.mp4' in url or '.gif' in url:
            return url

        if url[-1:] == '/':
            url = url[:-1]

        url = "https://gfycat.com/" + url.split('/')[-1]

        pageSource = (urllib.request.urlopen(url).read().decode().split('\n'))

        theLine = pageSource[lineNumber]
        lenght = len(query)
        link = []

        for i in range(len(theLine)):
            if theLine[i:i+lenght] == query:
                # Skip past the query text and the opening quote, then
                # collect characters until the closing quote.
                cursor = (i+lenght)+1
                while not theLine[cursor] == '"':
                    link.append(theLine[cursor])
                    cursor += 1
                break

        if "".join(link) == "":
            raise NotADownloadableLinkError("Could not read the page source")

        return "".join(link)
|
||||
|
||||
class Direct:
    """Download a post whose URL already points directly at the media."""

    def __init__(self, directory, POST):
        POST['postExt'] = getExtension(POST['postURL'])

        if not os.path.exists(directory):
            os.makedirs(directory)

        title = nameCorrector(POST['postTitle'])
        baseName = POST["postSubmitter"] + "_" + title + "_" + POST['postId']

        # Console feedback: the file name about to be written.
        print(baseName + POST['postExt'])

        fileDir = directory / (baseName + POST['postExt'])
        tempDir = directory / (baseName + ".tmp")

        try:
            getFile(fileDir, tempDir, POST['postURL'])
        except FileNameTooLong:
            # Fall back to just the reddit id when the full name is
            # rejected by the filesystem.
            getFile(directory / (POST['postId'] + POST['postExt']),
                    directory / (POST['postId'] + ".tmp"),
                    POST['postURL'])
|
||||
|
||||
class Self:
    """Save a self (text) post as a markdown file."""

    def __init__(self, directory, post):
        if not os.path.exists(directory):
            os.makedirs(directory)

        title = nameCorrector(post['postTitle'])
        fileName = post["postSubmitter"]+"_"+title+"_"+post['postId']+".md"

        # Console feedback: the file name about to be written.
        print(fileName)

        fileDir = directory / fileName

        if Path.is_file(fileDir):
            raise FileAlreadyExistsError

        try:
            self.writeToFile(fileDir, post)
        except FileNotFoundError:
            # Fall back to just the reddit id when the full name is
            # rejected by the filesystem.
            self.writeToFile(directory / (post['postId']+".md"), post)

    @staticmethod
    def writeToFile(directory, post):
        """Format the post as markdown and write it to *directory*."""

        content = (
            "## [{postTitle}]({postURL})\n"
            "{postContent}"
            "\n\n---\n\n"
            "submitted to [r/{postSubreddit}]"
            "(https://www.reddit.com/r/{postSubreddit})"
            " by [u/{postSubmitter}]"
            "(https://www.reddit.com/user/{postSubmitter})"
        ).format(**post)

        # VanillaPrint is the builtin print (the module-level `print`
        # is rebound to the logging version).
        with io.open(directory, "w", encoding="utf-8") as FILE:
            VanillaPrint(content, file=FILE)

        print("Downloaded")
|
|
@ -1,89 +0,0 @@
|
|||
import sys
|
||||
|
||||
class FauxTb(object):
    """Minimal stand-in for a traceback object.

    Carries the three attributes traceback consumers read, so instances
    can be chained through tb_next like real traceback frames.
    """

    def __init__(self, tb_frame, tb_lineno, tb_next):
        self.tb_frame, self.tb_lineno, self.tb_next = (
            tb_frame, tb_lineno, tb_next
        )
|
||||
|
||||
def current_stack(skip=0):
    """Return the current call stack as a list of (frame, lineno) pairs.

    *skip* additional frames (beyond this function and its caller) are
    dropped from the top of the stack.
    """

    # Force an exception purely to obtain a frame reference.
    try:
        1/0
    except ZeroDivisionError:
        frame = sys.exc_info()[2].tb_frame

    # Step past this function's own frame plus the requested extras.
    for _ in range(skip + 2):
        frame = frame.f_back

    pairs = []
    while frame is not None:
        pairs.append((frame, frame.f_lineno))
        frame = frame.f_back
    return pairs
|
||||
|
||||
def extend_traceback(tb, stack):
    """Prepend FauxTb nodes for each (frame, lineno) in *stack* onto *tb*."""
    extended = tb
    for frame, lineno in stack:
        extended = FauxTb(frame, lineno, extended)
    return extended
|
||||
|
||||
def full_exc_info(exc_info):
    """Like sys.exc_info, but with the traceback extended through callers."""
    exc_type, exc_value, tb = exc_info
    return exc_type, exc_value, extend_traceback(tb, current_stack(1))
|
||||
|
||||
class RedditLoginFailed(Exception):
    """Raised when the reddit OAuth authorization flow fails."""

class ImgurLoginError(Exception):
    """Raised when authenticating against the imgur API fails."""

class FileAlreadyExistsError(Exception):
    """Raised when a download target already exists on disk."""

class NotADownloadableLinkError(Exception):
    """Raised when no direct media link can be extracted from a post."""

class AlbumNotDownloadedCompletely(Exception):
    """Raised when some files of an album could not be downloaded."""

class FileNameTooLong(Exception):
    """Raised when the filesystem rejects a generated file name."""

class InvalidRedditLink(Exception):
    """Raised for URLs that are not recognised reddit links."""

class ProgramModeError(Exception):
    """Raised for an invalid program-mode selection."""

class SearchModeError(Exception):
    """Raised for an invalid search-mode selection."""

class RedditorNameError(Exception):
    """Raised for an invalid redditor (user) name."""

class NoMatchingSubmissionFound(Exception):
    """Raised when a search yields no matching submissions."""

class NoPrawSupport(Exception):
    """Raised for operations the PRAW API does not support."""

class NoRedditSupport(Exception):
    """Raised for operations reddit itself does not support."""

class MultiredditNotFound(Exception):
    """Raised when the requested multireddit does not exist."""

class InsufficientPermission(Exception):
    """Raised when the authorized account lacks permission for an action."""

class InvalidSortingType(Exception):
    """Raised for a sorting type that is invalid in the given context."""

# NOTE(review): this class shadows Python's builtin FileNotFoundError for
# this module and for anyone importing it from here — `except
# FileNotFoundError` then no longer catches the builtin OS error. Kept
# unchanged because renaming would break importers; consider removing it.
class FileNotFoundError(Exception):
    """Project-local error; shadows the builtin of the same name."""

class NoSuitablePost(Exception):
    """Raised when no downloadable post could be found."""

class ImgurLimitError(Exception):
    """Raised when the imgur API credit limit is exhausted."""
|
240
src/parser.py
240
src/parser.py
|
@ -1,240 +0,0 @@
|
|||
from pprint import pprint
|
||||
|
||||
try:
|
||||
from src.errors import InvalidRedditLink
|
||||
except ModuleNotFoundError:
|
||||
from errors import InvalidRedditLink
|
||||
|
||||
def QueryParser(PassedQueries, index):
    """Parse the query-string portion of a reddit URL path segment.

    PassedQueries -- a segment such as 'search?q=foo%20bar&sort=top'
    index -- position of the segment in the URL (kept for interface
             compatibility; unused here)

    Returns a dict of the query parameters plus a 'HEADER' key holding
    the text before the '?'.
    """

    ExtractedQueries = {}

    QuestionMarkIndex = PassedQueries.index("?")
    ExtractedQueries["HEADER"] = PassedQueries[:QuestionMarkIndex]
    Queries = PassedQueries[QuestionMarkIndex+1:]

    for Query in Queries.split("&"):
        # BUG FIX: the old `Query.split("=")[1]` silently dropped
        # everything after a second '='; partition keeps the full value.
        key, _, value = Query.partition("=")
        ExtractedQueries[key] = value

    if ExtractedQueries["HEADER"] == "search":
        # Reddit encodes spaces in search terms as %20.
        ExtractedQueries["q"] = ExtractedQueries["q"].replace("%20", " ")

    return ExtractedQueries
|
||||
|
||||
def LinkParser(LINK):
    """Break a reddit URL into its semantic parts.

    Returns a dict that may contain: 'post', 'subreddit', 'user',
    'multireddit', 'sort', 'search', 'submitted', 'saved', 'upvoted',
    'queries'. Raises InvalidRedditLink for non-reddit URLs.
    """

    RESULT = {}
    # NOTE(review): ShortLink is set below but never read afterwards.
    ShortLink = False

    if not "reddit.com" in LINK:
        raise InvalidRedditLink("Invalid reddit link")

    SplittedLink = LINK.split("/")

    # Drop the scheme ('https:', '') prefix segments if present.
    if SplittedLink[0] == "https:" or SplittedLink[0] == "http:":
        SplittedLink = SplittedLink[2:]

    # Bare 'reddit.com' (with or without a trailing slash) means the
    # front page sorted by 'best'.
    try:
        if (SplittedLink[-2].endswith("reddit.com") and \
            SplittedLink[-1] == "") or \
           SplittedLink[-1].endswith("reddit.com"):

            RESULT["sort"] = "best"
            return RESULT
    except IndexError:
        if SplittedLink[0].endswith("reddit.com"):
            RESULT["sort"] = "best"
            return RESULT

    if "redd.it" in SplittedLink:
        ShortLink = True

    # Strip the domain so the remaining segments are the URL path.
    if SplittedLink[0].endswith("reddit.com"):
        SplittedLink = SplittedLink[1:]

    # A 'comments' segment identifies a single-post link.
    if "comments" in SplittedLink:
        RESULT = {"post":LINK}
        return RESULT

    elif "me" in SplittedLink or \
         "u" in SplittedLink or \
         "user" in SplittedLink or \
         "r" in SplittedLink or \
         "m" in SplittedLink:

        # Subreddit, multireddit, or user page: the interesting name is
        # the segment right after (or before, for 'm') the marker.
        if "r" in SplittedLink:
            RESULT["subreddit"] = SplittedLink[SplittedLink.index("r") + 1]

        elif "m" in SplittedLink:
            RESULT["multireddit"] = SplittedLink[SplittedLink.index("m") + 1]
            RESULT["user"] = SplittedLink[SplittedLink.index("m") - 1]

        else:
            for index in range(len(SplittedLink)):
                if SplittedLink[index] == "u" or \
                   SplittedLink[index] == "user":

                    RESULT["user"] = SplittedLink[index+1]

                elif SplittedLink[index] == "me":
                    RESULT["user"] = "me"


    # Scan the remaining segments for sort keywords, user-page
    # sub-sections, and '?'-style query strings.
    for index in range(len(SplittedLink)):
        if SplittedLink[index] in [
            "hot","top","new","controversial","rising"
        ]:

            RESULT["sort"] = SplittedLink[index]

            # A leading sort segment means a sorted front page.
            if index == 0:
                RESULT["subreddit"] = "frontpage"

        elif SplittedLink[index] in ["submitted","saved","posts","upvoted"]:
            if SplittedLink[index] == "submitted" or \
               SplittedLink[index] == "posts":
                RESULT["submitted"] = {}

            elif SplittedLink[index] == "saved":
                RESULT["saved"] = True

            elif SplittedLink[index] == "upvoted":
                RESULT["upvoted"] = True

        elif "?" in SplittedLink[index]:
            # Delegate query-string parsing; route the parameters under
            # a key chosen by the segment's HEADER.
            ParsedQuery = QueryParser(SplittedLink[index],index)
            if ParsedQuery["HEADER"] == "search":
                del ParsedQuery["HEADER"]
                RESULT["search"] = ParsedQuery

            elif ParsedQuery["HEADER"] == "submitted" or \
                 ParsedQuery["HEADER"] == "posts":
                del ParsedQuery["HEADER"]
                RESULT["submitted"] = ParsedQuery

            else:
                del ParsedQuery["HEADER"]
                RESULT["queries"] = ParsedQuery

    # A bare user link with no sub-section defaults to their submissions.
    if not ("upvoted" in RESULT or \
            "saved" in RESULT or \
            "submitted" in RESULT or \
            "multireddit" in RESULT) and \
       "user" in RESULT:
        RESULT["submitted"] = {}

    return RESULT
|
||||
|
||||
def LinkDesigner(LINK):
    """Convert a reddit URL into the program's MODE dictionary.

    Parses the link with LinkParser, then fills in defaults for 'sort'
    and 'time' depending on what kind of page the link points at.
    """

    attributes = LinkParser(LINK)
    MODE = {}

    # Single post: sort/time are irrelevant.
    if "post" in attributes:
        MODE["post"] = attributes["post"]
        MODE["sort"] = ""
        MODE["time"] = ""
        return MODE

    elif "search" in attributes:
        MODE["search"] = attributes["search"]["q"]

        # restrict_sr limits a search to the current subreddit; if it is
        # off (or absent) the search runs against r/all.
        if "restrict_sr" in attributes["search"]:

            if not (attributes["search"]["restrict_sr"] == 0 or \
                    attributes["search"]["restrict_sr"] == "off" or \
                    attributes["search"]["restrict_sr"] == ""):

                if "subreddit" in attributes:
                    MODE["subreddit"] = attributes["subreddit"]
                elif "multireddit" in attributes:
                    # NOTE(review): key is spelled 'multreddit' here but
                    # 'multireddit' elsewhere — looks like a typo; confirm
                    # against consumers of MODE.
                    MODE["multreddit"] = attributes["multireddit"]
                    MODE["user"] = attributes["user"]
                else:
                    MODE["subreddit"] = "all"
            else:
                MODE["subreddit"] = "all"
        else:
            MODE["subreddit"] = "all"

        # Search defaults: all time, sorted by relevance.
        if "t" in attributes["search"]:
            MODE["time"] = attributes["search"]["t"]
        else:
            MODE["time"] = "all"

        if "sort" in attributes["search"]:
            MODE["sort"] = attributes["search"]["sort"]
        else:
            MODE["sort"] = "relevance"

        if "include_over_18" in attributes["search"]:
            if attributes["search"]["include_over_18"] == 1 or \
               attributes["search"]["include_over_18"] == "on":
                MODE["nsfw"] = True
            else:
                MODE["nsfw"] = False

    else:
        # Non-search links: pick time/sort from leftover query params.
        if "queries" in attributes:
            if not ("submitted" in attributes or \
                    "posts" in attributes):

                if "t" in attributes["queries"]:
                    MODE["time"] = attributes["queries"]["t"]
                else:
                    MODE["time"] = "day"
            else:
                if "t" in attributes["queries"]:
                    MODE["time"] = attributes["queries"]["t"]
                else:
                    MODE["time"] = "all"

            if "sort" in attributes["queries"]:
                MODE["sort"] = attributes["queries"]["sort"]
            else:
                MODE["sort"] = "new"
        else:
            MODE["time"] = "day"

    if "subreddit" in attributes and not "search" in attributes:
        MODE["subreddit"] = attributes["subreddit"]

    elif "user" in attributes and not "search" in attributes:
        MODE["user"] = attributes["user"]

        # User pages: submitted posts, saved posts, or upvoted posts.
        if "submitted" in attributes:
            MODE["submitted"] = True
            if "sort" in attributes["submitted"]:
                MODE["sort"] = attributes["submitted"]["sort"]
            elif "sort" in MODE:
                pass
            else:
                MODE["sort"] = "new"

            if "t" in attributes["submitted"]:
                MODE["time"] = attributes["submitted"]["t"]
            else:
                MODE["time"] = "all"

        elif "saved" in attributes:
            MODE["saved"] = True

        elif "upvoted" in attributes:
            MODE["upvoted"] = True

    elif "multireddit" in attributes:
        MODE["multireddit"] = attributes["multireddit"]

    # Final sort fallback: explicit link sort wins, then whatever was
    # set above, then 'hot'.
    if "sort" in attributes:
        MODE["sort"] = attributes["sort"]
    elif "sort" in MODE:
        pass
    else:
        MODE["sort"] = "hot"

    return MODE
|
||||
|
||||
# Manual test harness: paste a reddit link and inspect the parsed MODE.
if __name__ == "__main__":
    while True:
        link = input("> ")
        pprint(LinkDesigner(link))
|
482
src/searcher.py
482
src/searcher.py
|
@ -1,482 +0,0 @@
|
|||
import os
|
||||
import sys
|
||||
import random
|
||||
import socket
|
||||
import webbrowser
|
||||
|
||||
import praw
|
||||
from prawcore.exceptions import NotFound, ResponseException, Forbidden
|
||||
|
||||
from src.tools import GLOBAL, createLogFile, jsonFile, printToFile
|
||||
from src.errors import (NoMatchingSubmissionFound, NoPrawSupport,
|
||||
NoRedditSupport, MultiredditNotFound,
|
||||
InvalidSortingType, RedditLoginFailed,
|
||||
InsufficientPermission)
|
||||
|
||||
print = printToFile
|
||||
|
||||
def beginPraw(config, user_agent = str(socket.gethostname())):
    """Create and return an authorized praw.Reddit instance.

    Reuses the refresh token saved in the config file when it is still
    valid; otherwise runs the local-webserver OAuth flow, persists the
    new token, and returns the freshly authorized instance.
    """

    class GetAuth:
        def __init__(self, redditInstance, port):
            self.redditInstance = redditInstance
            self.PORT = int(port)

        def recieve_connection(self):
            """Wait for and then return a connected socket..
            Opens a TCP connection on the configured port and waits for
            a single client (reddit's OAuth redirect).
            """
            server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            server.bind(('localhost', self.PORT))
            server.listen(1)
            client = server.accept()[0]
            server.close()
            return client

        def send_message(self, message):
            """Send message to client and close the connection."""
            self.client.send(
                'HTTP/1.1 200 OK\r\n\r\n{}'.format(message).encode('utf-8')
            )
            self.client.close()

        def getRefreshToken(self, *scopes):
            state = str(random.randint(0, 65000))
            url = self.redditInstance.auth.url(scopes, state, 'permanent')
            print("Go to this URL and login to reddit:\n\n", url)
            webbrowser.open(url, new=2)

            self.client = self.recieve_connection()
            data = self.client.recv(1024).decode('utf-8')
            param_tokens = data.split(' ', 2)[1].split('?', 1)[1].split('&')
            params = {
                key: value for (key, value) in [token.split('=') \
                                                for token in param_tokens]
            }
            if state != params['state']:
                # BUG FIX: send_message takes only the message; the old
                # calls passed an undefined name `client` as a first
                # argument and raised NameError on the error paths.
                self.send_message(
                    'State mismatch. Expected: {} Received: {}'
                    .format(state, params['state'])
                )
                raise RedditLoginFailed
            elif 'error' in params:
                self.send_message(params['error'])
                raise RedditLoginFailed

            refresh_token = self.redditInstance.auth.authorize(params['code'])
            self.send_message(
                "<script>" \
                "alert(\"You can go back to terminal window now.\");" \
                "</script>"
            )
            return (self.redditInstance, refresh_token)

    """Start reddit instance"""

    scopes = ['identity','history','read']
    port = "1337"
    arguments = {
        "client_id":GLOBAL.reddit_client_id,
        "client_secret":GLOBAL.reddit_client_secret,
        "user_agent":user_agent
    }

    def _authorize():
        """Run the OAuth flow once and persist the refresh token."""
        arguments["redirect_uri"] = "http://localhost:" + str(port)
        freshReddit = praw.Reddit(**arguments)
        authorizedInstance = GetAuth(freshReddit, port).getRefreshToken(*scopes)
        freshReddit = authorizedInstance[0]
        refresh_token = authorizedInstance[1]
        jsonFile(GLOBAL.configDirectory / "config.json").add({
            "reddit_username":str(freshReddit.user.me()),
            "reddit_refresh_token":refresh_token
        })
        return freshReddit

    if "reddit_refresh_token" in GLOBAL.config:
        arguments["refresh_token"] = GLOBAL.config["reddit_refresh_token"]
        reddit = praw.Reddit(**arguments)
        try:
            # Cheap authenticated call; fails if the token was revoked.
            reddit.auth.scopes()
        except ResponseException:
            reddit = _authorize()
    else:
        reddit = _authorize()
    return reddit
|
||||
|
||||
def getPosts(args):
    """Call PRAW according to *args* and pass the result to redditSearcher.

    Returns whatever redditSearcher returns. Raises NoPrawSupport,
    NoRedditSupport, InvalidSortingType, MultiredditNotFound, or
    InsufficientPermission for unsupported/invalid requests.
    """

    config = GLOBAL.config
    reddit = beginPraw(config)

    if args["sort"] == "best":
        raise NoPrawSupport("PRAW does not support that")

    # 'frontpage' has no meaning for a search; widen it to r/all.
    if "subreddit" in args:
        if "search" in args:
            if args["subreddit"] == "frontpage":
                args["subreddit"] = "all"

    if "user" in args:
        if args["user"] == "me":
            args["user"] = str(reddit.user.me())

    # Only 'top' and 'controversial' listings accept time_filter;
    # searches always do.
    if not "search" in args:
        if args["sort"] == "top" or args["sort"] == "controversial":
            keyword_params = {
                "time_filter":args["time"],
                "limit":args["limit"]
            }
        # OTHER SORT TYPES DON'T TAKE TIME_FILTER
        else:
            keyword_params = {
                "limit":args["limit"]
            }
    else:
        keyword_params = {
            "time_filter":args["time"],
            "limit":args["limit"]
        }

    if "search" in args:
        if GLOBAL.arguments.sort in ["hot","rising","controversial"]:
            raise InvalidSortingType("Invalid sorting type has given")

        if "subreddit" in args:
            print (
                "search for \"{search}\" in\n" \
                "subreddit: {subreddit}\nsort: {sort}\n" \
                "time: {time}\nlimit: {limit}\n".format(
                    search=args["search"],
                    limit=args["limit"],
                    sort=args["sort"],
                    subreddit=args["subreddit"],
                    time=args["time"]
                ).upper(),noPrint=True
            )
            return redditSearcher(
                reddit.subreddit(args["subreddit"]).search(
                    args["search"],
                    limit=args["limit"],
                    sort=args["sort"],
                    time_filter=args["time"]
                )
            )

        elif "multireddit" in args:
            raise NoPrawSupport("PRAW does not support that")

        elif "user" in args:
            raise NoPrawSupport("PRAW does not support that")

        elif "saved" in args:
            # BUG FIX: this was `raise ("...")`, which raises a plain
            # string and crashes with TypeError instead of reporting the
            # intended domain error.
            raise NoRedditSupport("Reddit does not support that")

    if args["sort"] == "relevance":
        raise InvalidSortingType("Invalid sorting type has given")

    if "saved" in args:
        print(
            "saved posts\nuser:{username}\nlimit={limit}\n".format(
                username=reddit.user.me(),
                limit=args["limit"]
            ).upper(),noPrint=True
        )
        return redditSearcher(reddit.user.me().saved(limit=args["limit"]))

    if "subreddit" in args:

        if args["subreddit"] == "frontpage":

            print (
                "subreddit: {subreddit}\nsort: {sort}\n" \
                "time: {time}\nlimit: {limit}\n".format(
                    limit=args["limit"],
                    sort=args["sort"],
                    subreddit=args["subreddit"],
                    time=args["time"]
                ).upper(),noPrint=True
            )
            return redditSearcher(
                getattr(reddit.front,args["sort"]) (**keyword_params)
            )

        else:
            print (
                "subreddit: {subreddit}\nsort: {sort}\n" \
                "time: {time}\nlimit: {limit}\n".format(
                    limit=args["limit"],
                    sort=args["sort"],
                    subreddit=args["subreddit"],
                    time=args["time"]
                ).upper(),noPrint=True
            )
            return redditSearcher(
                getattr(
                    reddit.subreddit(args["subreddit"]),args["sort"]
                ) (**keyword_params)
            )

    elif "multireddit" in args:
        print (
            "user: {user}\n" \
            "multireddit: {multireddit}\nsort: {sort}\n" \
            "time: {time}\nlimit: {limit}\n".format(
                user=args["user"],
                limit=args["limit"],
                sort=args["sort"],
                multireddit=args["multireddit"],
                time=args["time"]
            ).upper(),noPrint=True
        )
        try:
            return redditSearcher(
                getattr(
                    reddit.multireddit(
                        args["user"], args["multireddit"]
                    ),args["sort"]
                ) (**keyword_params)
            )
        except NotFound:
            raise MultiredditNotFound("Multireddit not found")

    elif "submitted" in args:
        print (
            "submitted posts of {user}\nsort: {sort}\n" \
            "time: {time}\nlimit: {limit}\n".format(
                limit=args["limit"],
                sort=args["sort"],
                user=args["user"],
                time=args["time"]
            ).upper(),noPrint=True
        )
        return redditSearcher(
            getattr(
                reddit.redditor(args["user"]).submissions,args["sort"]
            ) (**keyword_params)
        )

    elif "upvoted" in args:
        print (
            "upvoted posts of {user}\nlimit: {limit}\n".format(
                user=args["user"],
                limit=args["limit"]
            ).upper(),noPrint=True
        )
        try:
            return redditSearcher(
                reddit.redditor(args["user"]).upvoted(limit=args["limit"])
            )
        except Forbidden:
            raise InsufficientPermission("You do not have permission to do that")

    elif "post" in args:
        print("post: {post}\n".format(post=args["post"]).upper(),noPrint=True)
        return redditSearcher(
            reddit.submission(url=args["post"]),SINGLE_POST=True
        )
|
||||
|
||||
def redditSearcher(posts,SINGLE_POST=False):
    """Check posts and decide if it can be downloaded.
    If so, create a dictionary with post details and append them to a list.
    Write all of posts to file. Return the list

    posts       -- iterable of praw submissions, or one submission when
                   SINGLE_POST is True
    SINGLE_POST -- treat `posts` as a single submission, not an iterable

    Raises NoMatchingSubmissionFound when nothing downloadable was found.
    """

    subList = []
    # Module-level counters: reset here, incremented by checkIfMatching(),
    # and read back for the per-type summary printed at the end.
    global subCount
    subCount = 0
    global orderCount
    orderCount = 0
    global gfycatCount
    gfycatCount = 0
    global imgurCount
    imgurCount = 0
    global eromeCount
    eromeCount = 0
    global directCount
    directCount = 0
    global selfCount
    selfCount = 0

    # Maps running submission number -> [details]; dumped to the POSTS log.
    allPosts = {}

    # NOTE(review): print() is called with a noPrint= keyword below, so it is
    # presumably rebound to printToFile from src/tools.py — confirm at import.
    print("\nGETTING POSTS")
    if GLOBAL.arguments.verbose: print("\n")
    postsFile = createLogFile("POSTS")

    if SINGLE_POST:
        submission = posts
        subCount += 1
        try:
            details = {'postId':submission.id,
                       'postTitle':submission.title,
                       'postSubmitter':str(submission.author),
                       'postType':None,
                       'postURL':submission.url,
                       'postSubreddit':submission.subreddit.display_name}
        except AttributeError:
            # Submission attributes could not be read; NOTE(review): `details`
            # may stay unbound here, making postsFile.add() below raise.
            pass

        result = checkIfMatching(submission)

        if result is not None:
            details = result
            orderCount += 1
            if GLOBAL.arguments.verbose:
                printSubmission(submission,subCount,orderCount)
            subList.append(details)

        # Log the single post whether or not it matched.
        postsFile.add({subCount:[details]})

    else:
        try:
            for submission in posts:
                subCount += 1

                # Progress indicator: one bullet per 100 posts (quiet mode)...
                if subCount % 100 == 0 and not GLOBAL.arguments.verbose:
                    sys.stdout.write("• ")
                    sys.stdout.flush()

                # ...and a fresh, indented line every 1000 posts.
                if subCount % 1000 == 0:
                    sys.stdout.write("\n"+" "*14)
                    sys.stdout.flush()

                try:
                    details = {'postId':submission.id,
                               'postTitle':submission.title,
                               'postSubmitter':str(submission.author),
                               'postType':None,
                               'postURL':submission.url,
                               'postSubreddit':submission.subreddit.display_name}
                except AttributeError:
                    # Unreadable submission (e.g. missing attributes): skip it.
                    continue

                result = checkIfMatching(submission)

                if result is not None:
                    details = result
                    orderCount += 1
                    if GLOBAL.arguments.verbose:
                        printSubmission(submission,subCount,orderCount)
                    subList.append(details)

                # Record every post seen, matched or not, for the log file.
                allPosts[subCount] = [details]
        except KeyboardInterrupt:
            # User aborted: keep whatever was gathered so far.
            print("\nKeyboardInterrupt",noPrint=True)

        postsFile.add(allPosts)

    if not len(subList) == 0:
        if GLOBAL.arguments.NoDownload or GLOBAL.arguments.verbose:
            print(
                f"\n\nTotal of {len(subList)} submissions found!"
            )
            print(
                f"{gfycatCount} GFYCATs, {imgurCount} IMGURs, " \
                f"{eromeCount} EROMEs, {directCount} DIRECTs " \
                f"and {selfCount} SELF POSTS",noPrint=True
            )
        else:
            print()
        return subList
    else:
        raise NoMatchingSubmissionFound("No matching submission was found")
|
||||
|
||||
def checkIfMatching(submission):
    """Classify *submission* by its hosting domain.

    Returns the post's details dictionary with 'postType' filled in
    ('gfycat', 'imgur', 'erome', 'direct' or 'self'), or None when the
    post is not downloadable or its attributes cannot be read.

    Side effect: increments the matching module-level per-type counter.
    """
    global gfycatCount
    global imgurCount
    global eromeCount
    global directCount
    global selfCount

    try:
        details = {'postId':submission.id,
                   'postTitle':submission.title,
                   'postSubmitter':str(submission.author),
                   'postType':None,
                   'postURL':submission.url,
                   'postSubreddit':submission.subreddit.display_name}
    except AttributeError:
        # Deleted/removed submissions may lack attributes; treat as no match.
        return None

    if 'gfycat' in submission.domain:
        details['postType'] = 'gfycat'
        gfycatCount += 1
        return details

    elif 'imgur' in submission.domain:
        details['postType'] = 'imgur'
        imgurCount += 1
        return details

    elif 'erome' in submission.domain:
        details['postType'] = 'erome'
        eromeCount += 1
        return details

    # BUG FIX: isDirectLink() was previously called twice per submission
    # (once for the test, once for the value); call it once and reuse.
    directURL = isDirectLink(submission.url)
    if directURL is not False:
        details['postType'] = 'direct'
        details['postURL'] = directURL
        directCount += 1
        return details

    elif submission.is_self:
        details['postType'] = 'self'
        details['postContent'] = submission.selftext
        selfCount += 1
        return details
    # Implicitly returns None when no handler matches (as before).
|
||||
|
||||
def printSubmission(SUB,validNumber,totalNumber):
    """Print post's link, title and media link to screen.

    SUB         -- a submission object with id, title, url and
                   subreddit.display_name attributes
    validNumber -- running count of matched submissions
    totalNumber -- running count of all submissions seen
    """

    print(validNumber,end=") ")
    print(totalNumber,end=" ")
    print(
        "https://www.reddit.com/"
        +"r/"
        +SUB.subreddit.display_name
        +"/comments/"
        +SUB.id
    )
    # Indent continuation lines so they align under the numbered first line
    # ("N) M " prefix = both numbers plus ") " and a space = +3 chars).
    indent = " "*(len(str(validNumber))+len(str(totalNumber))+3)
    print(indent,end="")

    try:
        print(SUB.title)
    # BUG FIX: was a bare `except:`, which also swallowed KeyboardInterrupt
    # and SystemExit; the trailing `pass` after the prints was redundant.
    except Exception:
        SUB.title = "unnamed"
        print("SUBMISSION NAME COULD NOT BE READ")

    print(indent,end="")
    print(SUB.url,end="\n\n")
|
||||
|
||||
def isDirectLink(URL):
    """Check if link is a direct image link.
    If so, return URL,
    if not, return False
    """

    # Extensions considered directly downloadable media.
    mediaExtensions = ('.jpg', '.png', '.mp4', '.webm', '.gif')

    # Drop a single trailing slash so extension matching is not confused.
    if URL[-1] == "/":
        URL = URL[:-1]

    # Reddit-hosted images are always direct links.
    if "i.reddituploads.com" in URL:
        return URL

    # Reddit-hosted video: point at the DASH stream variant.
    if "v.redd.it" in URL:
        return URL + "/DASH_600_K"

    # A URL containing any media extension (anywhere in the string, matching
    # the original behaviour) counts as direct; otherwise report False.
    return URL if any(extension in URL for extension in mediaExtensions) else False
|
148
src/tools.py
148
src/tools.py
|
@ -1,148 +0,0 @@
|
|||
import io
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from os import makedirs, path, remove
|
||||
from pathlib import Path
|
||||
|
||||
from src.errors import FileNotFoundError
|
||||
|
||||
class GLOBAL:
    """Declare global variables"""

    # Program start time (epoch seconds); used to stamp LOG_FILES folders.
    RUN_TIME = 0
    # Populated at runtime: parsed config, CLI arguments, download directory.
    config = None
    arguments = None
    directory = None
    # Default location of the user's configuration, under the home directory.
    configDirectory = Path.home() / "Bulk Downloader for Reddit"
    # NOTE(review): OAuth client credentials hard-coded in source; consider
    # moving them to configuration or a secrets store.
    reddit_client_id = "BSyphDdxYZAgVQ"
    reddit_client_secret = "bfqNJaRh8NMh-9eAr-t4TRz-Blk"
    # Reference to the builtin print — presumably kept so the original stays
    # reachable if the global `print` name is rebound (e.g. to printToFile);
    # confirm against the module that performs the rebinding.
    printVanilla = print
|
||||
|
||||
class jsonFile:
    """ Write and read JSON files

    Use add(self,toBeAdded) to add to files

    Use delete(self,*deletedKeys) to delete keys
    """

    FILEDIR = ""

    def __init__(self,FILEDIR):
        """Bind the file path; create the file with an empty JSON object
        if it does not exist yet.
        """
        self.FILEDIR = FILEDIR
        if not path.exists(self.FILEDIR):
            self.__writeToFile({},create=True)

    def read(self):
        """Return the file's parsed JSON content as a dictionary."""
        with open(self.FILEDIR, 'r') as f:
            return json.load(f)

    def add(self,toBeAdded):
        """Takes a dictionary and merges it with json file.
        It uses new key's value if a key already exists.
        Returns the new content as a dictionary.
        """

        data = self.read()
        data = {**data, **toBeAdded}
        self.__writeToFile(data)
        return self.read()

    def delete(self,*deleteKeys):
        """Delete given keys from JSON file.
        Returns the new content as a dictionary,
        or False if none of the keys were present.
        """

        # BUG FIX: `found` was only assigned inside the loop, so calling
        # delete() with no matching key raised NameError instead of
        # returning False.
        found = False
        data = self.read()
        for deleteKey in deleteKeys:
            if deleteKey in data:
                del data[deleteKey]
                found = True
        if not found:
            return False
        self.__writeToFile(data)
        # BUG FIX: honor the documented contract — return the new content
        # (previously returned None on success).
        return data

    def __writeToFile(self,content,create=False):
        """Replace the file's content with *content* serialized as JSON.
        When create=False the old file is removed first; create=True is
        used by __init__ when the file does not exist yet.
        """
        if not create:
            remove(self.FILEDIR)
        with open(self.FILEDIR, 'w') as f:
            json.dump(content, f, indent=4)
|
||||
|
||||
def createLogFile(TITLE):
    """Create a log file with given name
    inside a folder time stampt in its name and
    put given arguments inside "HEADER" key
    """

    # Folder name is the run's start time, shared by all logs of one run.
    stamp = time.strftime(
        "%d-%m-%Y_%H-%M-%S",time.localtime(GLOBAL.RUN_TIME)
    )
    logFolder = GLOBAL.directory / "LOG_FILES" / str(stamp)

    if not path.exists(logFolder):
        makedirs(logFolder)

    # One JSON log per TITLE; record the exact command line under "HEADER".
    logFile = jsonFile(logFolder / Path(TITLE.upper()+'.json'))
    logFile.add({"HEADER":" ".join(sys.argv)})

    return logFile
|
||||
|
||||
def printToFile(*args, noPrint=False,**kwargs):
    """Print to both CONSOLE and
    CONSOLE LOG file in a folder time stampt in the name
    """

    # Same timestamped folder that createLogFile uses for this run.
    stamp = str(time.strftime("%d-%m-%Y_%H-%M-%S",
                              time.localtime(GLOBAL.RUN_TIME)))
    logFolder = GLOBAL.directory / "LOG_FILES" / stamp

    # Echo to the console unless suppressed; verbose mode and explicit
    # file= targets always go through.
    shouldEcho = (not noPrint) or GLOBAL.arguments.verbose or ("file" in kwargs)
    if shouldEcho:
        print(*args,**kwargs)

    if not path.exists(logFolder):
        makedirs(logFolder)

    # Mirror console output into CONSOLE_LOG.txt, but only when the caller
    # did not already redirect to a specific file.
    if "file" not in kwargs:
        with io.open(
            logFolder / "CONSOLE_LOG.txt","a",encoding="utf-8"
        ) as logHandle:
            print(*args, file=logHandle, **kwargs)
|
||||
|
||||
def nameCorrector(string):
    """Swap strange characters from given string
    with underscore (_) and shorten it.
    Return the string.

    Specifically: truncate to 200 characters, replace spaces and
    filesystem-unsafe characters with '_', and remove newlines.
    """

    # Truncate first so the replacements below never exceed the cap.
    string = string[:200]

    # Newlines are removed outright (not replaced), matching prior behaviour.
    string = string.replace("\n", "")

    # Characters invalid or troublesome in filenames on Windows/POSIX,
    # plus '.' and '#'.
    BAD_CHARS = '\\/:*?"<>|.#'

    # One C-level pass replaces spaces and every bad character with '_'
    # (previously done with two hand-rolled per-character loops and an
    # unused `correctedString` list).
    return string.translate(
        str.maketrans(" " + BAD_CHARS, "_" * (len(BAD_CHARS) + 1))
    )
|
2
tests/__init__.py
Normal file
2
tests/__init__.py
Normal file
|
@ -0,0 +1,2 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
2
tests/archive_entry/__init__.py
Normal file
2
tests/archive_entry/__init__.py
Normal file
|
@ -0,0 +1,2 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
48
tests/archive_entry/test_comment_archive_entry.py
Normal file
48
tests/archive_entry/test_comment_archive_entry.py
Normal file
|
@ -0,0 +1,48 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import praw
|
||||
import pytest
|
||||
|
||||
from bdfr.archive_entry.comment_archive_entry import CommentArchiveEntry
|
||||
|
||||
|
||||
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(
    ("test_comment_id", "expected_dict"),
    (
        (
            "gstd4hk",
            {
                "author": "james_pic",
                "subreddit": "Python",
                "submission": "mgi4op",
                "submission_title": "76% Faster CPython",
                "distinguished": None,
            },
        ),
    ),
)
def test_get_comment_details(test_comment_id: str, expected_dict: dict, reddit_instance: praw.Reddit):
    """Archive entry compiled from a live comment must carry the expected metadata."""
    comment = reddit_instance.comment(id=test_comment_id)
    test_entry = CommentArchiveEntry(comment)
    result = test_entry.compile()
    # FIX: pass a generator to all() instead of materialising a list
    # (ruff C419); iterating the dict directly avoids the .keys() call.
    assert all(result.get(key) == expected_dict[key] for key in expected_dict)
|
||||
|
||||
|
||||
@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(
    ("test_comment_id", "expected_min_comments"),
    (
        ("gstd4hk", 4),
        ("gsvyste", 3),
        ("gsxnvvb", 5),
    ),
)
def test_get_comment_replies(test_comment_id: str, expected_min_comments: int, reddit_instance: praw.Reddit):
    """Compiled archive entry must include at least the known number of replies."""
    entry = CommentArchiveEntry(reddit_instance.comment(id=test_comment_id))
    compiled = entry.compile()
    assert len(compiled.get("replies")) >= expected_min_comments
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue